1) Checking the dataset
from sklearn.datasets import load_iris
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import warnings
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
Q1. Explore the dataset's general structure with the info and describe methods.
# Load the housing dataset from the project spreadsheet into a DataFrame.
dataset_path = "DataSet.xlsx"
df = pd.read_excel(dataset_path)
# Display the DataFrame (notebook-style echo of the last expression).
df
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | NaN | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273.0 | 21.0 | 391.99 | 9.67 | 22.4 |
| 502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273.0 | 21.0 | 396.90 | 9.08 | 20.6 |
| 503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273.0 | 21.0 | 396.90 | 5.64 | 23.9 |
| 504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273.0 | 21.0 | 393.45 | 6.48 | 22.0 |
| 505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273.0 | 21.0 | 396.90 | 7.88 | 11.9 |
506 rows × 14 columns
# Print per-column dtypes and non-null counts — used here to spot columns
# with missing values (e.g. CHAS, DIS, B, MEDV in the output below).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 506 entries, 0 to 505 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CRIM 506 non-null float64 1 ZN 506 non-null float64 2 INDUS 506 non-null float64 3 CHAS 480 non-null float64 4 NOX 506 non-null float64 5 RM 506 non-null float64 6 AGE 506 non-null float64 7 DIS 479 non-null float64 8 RAD 506 non-null int64 9 TAX 506 non-null float64 10 PTRATIO 506 non-null float64 11 B 486 non-null float64 12 LSTAT 506 non-null float64 13 MEDV 452 non-null float64 dtypes: float64(13), int64(1) memory usage: 55.5 KB
# Print summary statistics (count/mean/std/min/quartiles/max) for each
# numeric column; the count row reveals how many non-null values each has.
df.describe()
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 506.000000 | 506.000000 | 506.000000 | 480.000000 | 506.000000 | 506.000000 | 506.000000 | 479.000000 | 506.000000 | 506.000000 | 506.000000 | 486.000000 | 506.000000 | 452.000000 |
| mean | 1.269195 | 13.295257 | 9.205158 | 0.175000 | 1.101175 | 15.679800 | 58.744660 | 6.211663 | 78.063241 | 339.317787 | 42.614980 | 336.820947 | 11.537806 | 23.750442 |
| std | 2.399207 | 23.048697 | 7.169630 | 0.380364 | 1.646991 | 27.220206 | 33.104049 | 6.527286 | 203.542157 | 180.670077 | 87.585243 | 121.174519 | 6.064932 | 8.808602 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.385000 | 3.561000 | 1.137000 | 1.129600 | 1.000000 | 20.200000 | 2.600000 | 0.320000 | 1.730000 | 6.300000 |
| 25% | 0.049443 | 0.000000 | 3.440000 | 0.000000 | 0.449000 | 5.961500 | 32.000000 | 2.425900 | 4.000000 | 254.000000 | 17.000000 | 370.415000 | 6.877500 | 18.500000 |
| 50% | 0.144655 | 0.000000 | 6.960000 | 0.000000 | 0.538000 | 6.322500 | 65.250000 | 3.917500 | 5.000000 | 307.000000 | 18.900000 | 390.885000 | 10.380000 | 21.950000 |
| 75% | 0.819623 | 18.100000 | 18.100000 | 0.000000 | 0.647000 | 6.949000 | 89.975000 | 6.341400 | 24.000000 | 403.000000 | 20.200000 | 395.630000 | 15.015000 | 26.600000 |
| max | 9.966540 | 100.000000 | 27.740000 | 1.000000 | 7.313000 | 100.000000 | 100.000000 | 24.000000 | 666.000000 | 711.000000 | 396.900000 | 396.900000 | 34.410000 | 50.000000 |
# Temporarily lift pandas display limits so every row and column prints,
# rounded to 3 decimals; settings are restored when the context exits.
display_opts = (
    "display.max_rows", None,
    "display.max_columns", None,
    "display.precision", 3,
)
with pd.option_context(*display_opts):
    print(df)
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \
0 0.006 18.0 2.31 0.0 0.538 6.575 65.200 4.090 1 296.0
1 0.027 0.0 7.07 0.0 0.469 6.421 78.900 4.967 2 242.0
2 0.027 0.0 7.07 0.0 0.469 7.185 61.100 4.967 2 242.0
3 0.032 0.0 2.18 0.0 0.458 6.998 45.800 6.062 3 222.0
4 0.069 0.0 2.18 0.0 0.458 7.147 54.200 6.062 3 222.0
5 0.030 0.0 2.18 0.0 0.458 6.430 58.700 6.062 3 222.0
6 0.088 12.5 7.87 0.0 0.524 6.012 66.600 5.561 5 311.0
7 0.145 12.5 7.87 0.0 0.524 6.172 96.100 5.950 5 311.0
8 0.211 12.5 7.87 0.0 0.524 5.631 100.000 6.082 5 311.0
9 0.170 12.5 7.87 0.0 0.524 6.004 85.900 6.592 5 311.0
10 0.225 12.5 7.87 0.0 0.524 6.377 94.300 6.347 5 311.0
11 0.117 12.5 7.87 0.0 0.524 6.009 82.900 6.227 5 311.0
12 0.094 12.5 7.87 0.0 0.524 5.889 39.000 5.451 5 311.0
13 0.630 0.0 8.14 0.0 0.538 5.949 61.800 4.707 4 307.0
14 0.638 0.0 8.14 0.0 0.538 6.096 84.500 4.462 4 307.0
15 0.627 0.0 8.14 0.0 0.538 5.834 56.500 NaN 4 307.0
16 1.054 0.0 8.14 0.0 0.538 5.935 29.300 4.499 4 307.0
17 0.784 0.0 8.14 0.0 0.538 5.990 81.700 4.258 4 307.0
18 0.803 0.0 8.14 0.0 0.538 5.456 36.600 3.796 4 307.0
19 0.726 0.0 8.14 0.0 0.538 5.727 69.500 3.796 4 307.0
20 1.252 0.0 8.14 0.0 0.538 5.570 98.100 3.798 4 307.0
21 0.852 0.0 8.14 0.0 0.538 5.965 89.200 4.012 4 307.0
22 1.232 0.0 8.14 0.0 0.538 6.142 91.700 3.977 4 307.0
23 0.988 0.0 8.14 0.0 0.538 5.813 100.000 4.095 4 307.0
24 0.750 0.0 8.14 0.0 0.538 5.924 94.100 4.400 4 307.0
25 0.841 0.0 8.14 0.0 0.538 5.599 85.700 4.455 4 307.0
26 0.672 0.0 8.14 0.0 0.538 5.813 90.300 4.682 4 307.0
27 0.956 0.0 8.14 0.0 0.538 6.047 88.800 4.453 4 307.0
28 0.773 0.0 8.14 0.0 0.538 6.495 94.400 4.455 4 307.0
29 1.002 0.0 8.14 0.0 0.538 6.674 87.300 4.239 4 307.0
30 1.131 0.0 8.14 0.0 0.538 5.713 94.100 4.233 4 307.0
31 1.355 0.0 8.14 0.0 0.538 6.072 100.000 4.175 4 307.0
32 1.388 0.0 8.14 0.0 0.538 5.950 82.000 3.990 4 307.0
33 1.152 0.0 8.14 0.0 0.538 5.701 95.000 3.787 4 307.0
34 1.613 0.0 8.14 0.0 0.538 6.096 96.900 3.760 4 307.0
35 0.064 0.0 5.96 0.0 0.499 5.933 68.200 3.360 5 279.0
36 0.097 0.0 5.96 0.0 0.499 5.841 61.400 3.378 5 279.0
37 0.080 0.0 5.96 0.0 0.499 5.850 41.500 3.934 5 279.0
38 0.175 0.0 5.96 0.0 0.499 5.966 30.200 3.847 5 279.0
39 0.028 75.0 2.95 0.0 0.428 6.595 21.800 5.401 3 252.0
40 0.034 75.0 2.95 0.0 0.428 7.024 15.800 NaN 3 252.0
41 0.127 0.0 6.91 0.0 0.448 6.770 2.900 5.721 3 233.0
42 0.141 0.0 6.91 0.0 0.448 6.169 6.600 5.721 3 233.0
43 0.159 0.0 6.91 0.0 0.448 6.211 6.500 5.721 3 233.0
44 0.123 0.0 6.91 0.0 0.448 6.069 40.000 5.721 3 233.0
45 0.171 0.0 6.91 0.0 0.448 5.682 33.800 5.100 3 233.0
46 0.188 0.0 6.91 0.0 0.448 5.786 33.300 5.100 3 233.0
47 0.229 0.0 6.91 0.0 0.448 6.030 85.500 5.689 3 233.0
48 0.254 0.0 6.91 0.0 0.448 5.399 95.300 5.870 3 233.0
49 0.220 0.0 6.91 0.0 0.448 5.602 62.000 6.088 3 233.0
50 0.089 21.0 5.64 0.0 0.439 5.963 45.700 6.815 4 243.0
51 0.043 21.0 5.64 0.0 0.439 6.115 63.000 6.815 4 243.0
52 0.054 21.0 5.64 0.0 0.439 6.511 21.100 6.815 4 243.0
53 0.050 21.0 5.64 0.0 0.439 5.998 21.400 6.815 4 243.0
54 0.014 75.0 4.00 0.0 0.410 5.888 47.600 7.320 3 469.0
55 0.013 90.0 1.22 0.0 0.403 7.249 21.900 8.697 5 226.0
56 0.021 85.0 0.74 0.0 0.410 6.383 35.700 9.188 2 313.0
57 0.014 100.0 1.32 0.0 0.411 6.816 40.500 8.325 5 256.0
58 0.154 25.0 5.13 0.0 0.453 6.145 29.200 7.815 8 284.0
59 0.103 25.0 5.13 0.0 0.453 5.927 47.200 6.932 8 284.0
60 0.149 25.0 5.13 0.0 0.453 5.741 66.200 NaN 8 284.0
61 0.172 25.0 5.13 0.0 0.453 5.966 93.400 6.819 8 284.0
62 0.110 25.0 5.13 0.0 0.453 6.456 67.800 7.226 8 284.0
63 0.127 25.0 5.13 0.0 0.453 6.762 43.400 7.981 8 284.0
64 0.020 17.5 1.38 0.0 0.416 7.104 59.500 9.223 3 216.0
65 0.036 80.0 3.37 0.0 0.398 6.290 17.800 6.612 4 337.0
66 0.044 80.0 3.37 0.0 0.398 5.787 31.100 6.612 4 337.0
67 0.058 12.5 6.07 0.0 0.409 5.878 21.400 6.498 4 345.0
68 0.136 12.5 6.07 0.0 0.409 5.594 36.800 6.498 4 345.0
69 0.128 12.5 6.07 0.0 0.409 5.885 33.000 6.498 4 345.0
70 0.088 0.0 10.81 0.0 0.413 6.417 6.600 5.287 4 305.0
71 0.159 0.0 10.81 0.0 0.413 5.961 17.500 5.287 4 305.0
72 0.092 0.0 10.81 0.0 0.413 6.065 7.800 5.287 4 305.0
73 0.195 0.0 10.81 0.0 0.413 6.245 6.200 5.287 4 305.0
74 0.079 0.0 12.83 0.0 0.437 6.273 6.000 4.252 5 398.0
75 0.095 0.0 12.83 0.0 0.437 6.286 45.000 4.503 5 398.0
76 0.102 0.0 12.83 0.0 0.437 6.279 74.500 4.052 5 398.0
77 0.087 0.0 12.83 0.0 0.437 6.140 45.800 4.090 5 398.0
78 0.056 0.0 12.83 0.0 0.437 6.232 53.700 NaN 5 398.0
79 0.084 0.0 12.83 0.0 0.437 5.874 36.600 4.503 5 398.0
80 0.041 25.0 4.86 NaN 0.426 6.727 33.500 5.401 4 281.0
81 0.045 25.0 4.86 0.0 0.426 6.619 70.400 5.401 4 281.0
82 0.037 25.0 4.86 0.0 0.426 6.302 32.200 5.401 4 281.0
83 0.036 25.0 4.86 0.0 0.426 6.167 46.700 5.401 4 281.0
84 0.051 0.0 4.49 0.0 0.449 6.389 48.000 4.779 3 247.0
85 0.057 0.0 4.49 0.0 0.449 6.630 56.100 4.438 3 247.0
86 0.052 0.0 4.49 0.0 0.449 6.015 45.100 4.427 3 247.0
87 0.072 0.0 4.49 0.0 0.449 6.121 56.800 3.748 3 247.0
88 0.057 0.0 3.41 0.0 0.489 7.007 86.300 3.422 2 270.0
89 0.053 0.0 3.41 0.0 0.489 7.079 63.100 3.414 2 270.0
90 0.047 0.0 3.41 0.0 0.489 6.417 66.100 3.092 2 270.0
91 0.039 0.0 3.41 0.0 0.489 6.405 73.900 3.092 2 270.0
92 0.042 28.0 15.04 0.0 0.464 6.442 53.600 3.666 4 270.0
93 0.029 28.0 15.04 NaN 0.464 6.211 28.900 3.666 4 270.0
94 0.043 28.0 15.04 0.0 0.464 6.249 77.300 3.615 4 270.0
95 0.122 0.0 2.89 0.0 0.445 6.625 57.800 3.495 2 276.0
96 0.115 0.0 2.89 0.0 0.445 6.163 69.600 3.495 2 276.0
97 0.121 0.0 2.89 0.0 0.445 8.069 76.000 3.495 2 276.0
98 0.082 0.0 2.89 0.0 0.445 7.820 36.900 NaN 2 276.0
99 0.069 0.0 2.89 0.0 0.445 7.416 62.500 3.495 2 276.0
100 0.149 0.0 8.56 0.0 0.520 6.727 79.900 2.778 5 384.0
101 0.114 0.0 8.56 0.0 0.520 6.781 71.300 2.856 5 384.0
102 0.229 0.0 8.56 0.0 0.520 6.405 85.400 2.715 5 384.0
103 0.212 0.0 8.56 0.0 0.520 6.137 87.400 2.715 5 384.0
104 0.140 0.0 8.56 0.0 0.520 6.167 90.000 2.421 5 384.0
105 0.133 0.0 8.56 0.0 0.520 5.851 96.700 2.107 5 384.0
106 0.171 0.0 8.56 0.0 0.520 5.836 91.900 2.211 5 384.0
107 0.131 0.0 8.56 0.0 0.520 6.127 85.200 2.122 5 384.0
108 0.128 0.0 8.56 0.0 0.520 6.474 97.100 2.433 5 384.0
109 0.264 0.0 8.56 0.0 0.520 6.229 91.200 2.545 5 384.0
110 0.108 0.0 8.56 0.0 0.520 6.195 54.400 2.778 5 384.0
111 0.101 0.0 10.01 0.0 0.547 6.715 81.600 2.678 6 432.0
112 0.123 0.0 10.01 0.0 0.547 5.913 92.900 2.353 6 432.0
113 0.222 0.0 10.01 0.0 0.547 6.092 95.400 2.548 6 432.0
114 0.142 0.0 10.01 0.0 0.547 6.254 84.200 2.256 6 432.0
115 0.171 0.0 10.01 0.0 0.547 5.928 88.200 2.463 6 432.0
116 0.132 0.0 10.01 0.0 0.547 6.176 72.500 2.730 6 432.0
117 0.151 0.0 10.01 0.0 0.547 6.021 82.600 2.747 6 432.0
118 0.131 0.0 10.01 0.0 0.547 5.872 73.100 2.478 6 432.0
119 0.145 0.0 10.01 0.0 0.547 5.731 65.200 2.759 6 432.0
120 0.069 0.0 25.65 0.0 0.581 5.870 69.700 2.258 2 188.0
121 0.072 0.0 25.65 0.0 0.581 6.004 84.100 2.197 2 188.0
122 0.093 0.0 25.65 0.0 0.581 5.961 92.900 NaN 2 188.0
123 0.150 0.0 25.65 0.0 0.581 5.856 97.000 1.944 2 188.0
124 0.098 0.0 25.65 0.0 0.581 5.879 95.800 2.006 2 188.0
125 0.169 0.0 25.65 0.0 0.581 5.986 88.400 1.993 2 188.0
126 0.387 0.0 25.65 0.0 0.581 5.613 95.600 1.757 2 188.0
127 0.259 0.0 21.89 0.0 0.624 5.693 96.000 1.788 4 437.0
128 0.325 0.0 21.89 0.0 0.624 6.431 98.800 1.812 4 437.0
129 0.881 0.0 21.89 0.0 0.624 5.637 94.700 1.980 4 437.0
130 0.340 0.0 21.89 0.0 0.624 6.458 98.900 2.119 4 437.0
131 1.193 0.0 21.89 0.0 0.624 6.326 97.700 2.271 4 437.0
132 0.590 0.0 21.89 0.0 0.624 6.372 97.900 2.327 4 437.0
133 0.330 0.0 21.89 0.0 0.624 5.822 95.400 2.470 4 437.0
134 0.976 0.0 21.89 NaN 0.624 5.757 98.400 2.346 4 437.0
135 0.558 0.0 21.89 0.0 0.624 6.335 98.200 2.111 4 437.0
136 0.323 0.0 21.89 0.0 0.624 5.942 93.500 1.967 4 437.0
137 0.352 0.0 21.89 0.0 0.624 6.454 98.400 1.850 4 437.0
138 0.250 0.0 21.89 0.0 0.624 5.857 98.200 1.669 4 437.0
139 0.545 0.0 21.89 0.0 0.624 6.151 97.900 1.669 4 437.0
140 0.291 0.0 21.89 0.0 0.624 6.174 93.600 1.612 4 437.0
141 1.629 0.0 21.89 0.0 0.624 5.019 100.000 1.439 4 437.0
142 3.321 0.0 19.58 1.0 0.871 5.403 100.000 1.322 5 403.0
143 4.097 0.0 19.58 0.0 0.871 5.468 100.000 1.412 5 403.0
144 2.780 0.0 19.58 0.0 0.871 4.903 97.800 1.346 5 403.0
145 2.379 0.0 19.58 NaN 0.871 6.130 100.000 1.419 5 403.0
146 2.155 0.0 19.58 0.0 0.871 5.628 100.000 1.517 5 403.0
147 2.369 0.0 19.58 0.0 0.871 4.926 95.700 NaN 5 403.0
148 2.331 0.0 19.58 0.0 0.871 5.186 93.800 1.530 5 403.0
149 2.734 0.0 19.58 0.0 0.871 5.597 94.900 1.526 5 403.0
150 1.657 0.0 19.58 0.0 0.871 6.122 97.300 1.618 5 403.0
151 1.496 0.0 19.58 0.0 0.871 5.404 100.000 1.592 5 403.0
152 1.127 0.0 19.58 1.0 0.871 5.012 88.000 1.610 5 403.0
153 2.149 0.0 19.58 0.0 0.871 5.709 98.500 1.623 5 403.0
154 1.414 0.0 19.58 1.0 0.871 6.129 96.000 1.749 5 403.0
155 3.535 0.0 19.58 1.0 0.871 6.152 82.600 1.746 5 403.0
156 2.447 0.0 19.58 0.0 0.871 5.272 94.000 1.736 5 403.0
157 1.224 0.0 19.58 0.0 0.605 6.943 97.400 1.877 5 403.0
158 1.343 0.0 19.58 0.0 0.605 6.066 100.000 1.757 5 403.0
159 1.425 0.0 19.58 0.0 0.871 6.510 100.000 1.766 5 403.0
160 1.273 0.0 19.58 1.0 0.605 6.250 92.600 1.798 5 403.0
161 1.463 0.0 19.58 0.0 0.605 7.489 90.800 1.971 5 403.0
162 1.834 0.0 19.58 NaN 0.605 7.802 98.200 2.041 5 403.0
163 1.519 0.0 19.58 1.0 0.605 8.375 93.900 2.162 5 403.0
164 2.242 0.0 19.58 0.0 0.605 5.854 91.800 2.422 5 403.0
165 2.924 0.0 19.58 NaN 0.605 6.101 93.000 2.283 5 403.0
166 2.010 0.0 19.58 0.0 0.605 7.929 96.200 2.046 5 403.0
167 1.800 0.0 19.58 0.0 0.605 5.877 79.200 2.426 5 403.0
168 2.300 0.0 19.58 0.0 0.605 6.319 96.100 2.100 5 403.0
169 2.450 0.0 19.58 0.0 0.605 6.402 95.200 2.263 5 403.0
170 1.207 0.0 19.58 0.0 0.605 5.875 94.600 2.426 5 403.0
171 2.314 0.0 19.58 0.0 0.605 5.880 97.300 2.389 5 403.0
172 0.139 0.0 4.05 0.0 0.510 5.572 88.500 2.596 5 296.0
173 0.092 0.0 4.05 0.0 0.510 6.416 84.100 2.646 5 296.0
174 0.084 0.0 4.05 0.0 0.510 5.859 68.700 2.702 5 296.0
175 0.067 0.0 4.05 0.0 0.510 6.546 33.100 3.132 5 296.0
176 0.070 0.0 4.05 0.0 0.510 6.020 47.200 3.555 5 296.0
177 0.054 0.0 4.05 0.0 0.510 6.315 73.400 3.317 5 296.0
178 0.066 0.0 4.05 NaN 0.510 6.860 74.400 2.915 5 296.0
179 0.058 0.0 2.46 0.0 0.488 6.980 58.400 2.829 3 193.0
180 0.066 0.0 2.46 0.0 0.488 7.765 83.300 NaN 3 193.0
181 0.069 0.0 2.46 0.0 0.488 6.144 62.200 2.598 3 193.0
182 0.091 0.0 2.46 0.0 0.488 7.155 92.200 2.701 3 193.0
183 0.100 0.0 2.46 0.0 0.488 6.563 95.600 2.847 3 193.0
184 0.083 0.0 2.46 0.0 0.488 5.604 89.800 2.988 3 193.0
185 0.060 0.0 2.46 0.0 0.488 6.153 68.800 3.280 3 193.0
186 0.056 0.0 2.46 0.0 0.488 7.831 53.600 3.199 3 193.0
187 0.079 45.0 3.44 0.0 0.437 6.782 41.100 3.789 5 398.0
188 0.126 45.0 3.44 0.0 0.437 6.556 29.100 4.567 5 398.0
189 0.084 45.0 3.44 0.0 0.437 7.185 38.900 4.567 5 398.0
190 0.091 45.0 3.44 0.0 0.437 6.951 21.500 6.480 5 398.0
191 0.069 45.0 3.44 0.0 0.437 6.739 30.800 6.480 5 398.0
192 0.087 45.0 3.44 0.0 0.437 7.178 26.300 6.480 5 398.0
193 0.022 60.0 2.93 0.0 0.401 6.800 9.900 6.220 1 265.0
194 0.014 60.0 2.93 NaN 0.401 6.604 18.800 6.220 1 265.0
195 0.014 80.0 0.46 0.0 0.422 7.875 32.000 5.648 4 255.0
196 0.040 80.0 1.52 0.0 0.404 7.287 34.100 7.309 2 329.0
197 0.047 80.0 1.52 0.0 0.404 7.107 36.600 7.309 2 329.0
198 0.038 80.0 1.52 0.0 0.404 7.274 38.300 7.309 2 329.0
199 0.032 95.0 1.47 0.0 0.403 6.975 15.300 7.653 3 402.0
200 0.018 95.0 1.47 0.0 0.403 7.135 13.900 7.653 3 402.0
201 0.034 82.5 2.03 0.0 0.415 6.162 38.400 6.270 2 348.0
202 0.022 82.5 2.03 0.0 0.415 7.610 15.700 6.270 2 348.0
203 0.035 95.0 2.68 NaN 0.416 7.853 33.200 5.118 4 224.0
204 0.020 95.0 2.68 0.0 0.416 8.034 31.900 5.118 4 224.0
205 0.136 0.0 10.59 0.0 0.489 5.891 22.300 3.945 4 277.0
206 0.230 0.0 10.59 0.0 0.489 6.326 52.500 4.355 4 277.0
207 0.252 0.0 10.59 0.0 0.489 5.783 72.700 4.355 4 277.0
208 0.136 0.0 10.59 1.0 0.489 6.064 59.100 4.239 4 277.0
209 0.436 0.0 10.59 1.0 0.489 5.344 100.000 NaN 4 277.0
210 0.174 0.0 10.59 1.0 0.489 5.960 92.100 3.877 4 277.0
211 0.376 0.0 10.59 1.0 0.489 5.404 88.600 3.665 4 277.0
212 0.217 0.0 10.59 1.0 0.489 5.807 53.800 3.653 4 277.0
213 0.141 0.0 10.59 0.0 0.489 6.375 32.300 3.945 4 277.0
214 0.290 0.0 10.59 0.0 0.489 5.412 9.800 3.587 4 277.0
215 0.198 0.0 10.59 0.0 0.489 6.182 42.400 3.945 4 277.0
216 0.046 0.0 13.89 1.0 0.550 5.888 56.000 3.112 5 276.0
217 0.070 0.0 13.89 0.0 0.550 6.642 85.100 3.421 5 276.0
218 0.111 0.0 13.89 1.0 0.550 5.951 93.800 2.889 5 276.0
219 0.114 0.0 13.89 1.0 0.550 6.373 92.400 3.363 5 276.0
220 0.358 0.0 6.20 1.0 0.507 6.951 88.500 2.862 8 307.0
221 0.408 0.0 6.20 1.0 0.507 6.164 91.300 3.048 8 307.0
222 0.624 0.0 6.20 NaN 0.507 6.879 77.700 3.272 8 307.0
223 0.615 0.0 6.20 0.0 0.507 6.618 80.800 3.272 8 307.0
224 0.315 0.0 6.20 0.0 0.504 8.266 78.300 2.894 8 307.0
225 0.527 0.0 6.20 0.0 0.504 8.725 83.000 2.894 8 307.0
226 0.382 0.0 6.20 0.0 0.504 8.040 86.500 3.216 8 307.0
227 0.412 0.0 6.20 0.0 0.504 7.163 79.900 3.216 8 307.0
228 0.298 0.0 6.20 0.0 0.504 7.686 17.000 3.375 8 307.0
229 0.442 0.0 6.20 0.0 0.504 6.552 21.400 3.375 8 307.0
230 0.537 0.0 6.20 0.0 0.504 5.981 68.100 3.671 8 307.0
231 0.463 0.0 6.20 0.0 0.504 7.412 76.900 3.671 8 307.0
232 0.575 0.0 6.20 0.0 0.507 8.337 73.300 3.838 8 307.0
233 0.331 0.0 6.20 0.0 0.507 8.247 70.400 3.652 8 307.0
234 0.448 0.0 6.20 1.0 0.507 6.726 66.500 NaN 8 307.0
235 0.330 0.0 6.20 0.0 0.507 6.086 61.500 3.652 8 307.0
236 0.521 0.0 6.20 1.0 0.507 6.631 76.500 4.148 8 307.0
237 0.512 0.0 6.20 0.0 0.507 7.358 71.600 4.148 8 307.0
238 0.082 30.0 4.93 0.0 0.428 6.481 18.500 6.190 6 300.0
239 0.093 30.0 4.93 0.0 0.428 6.606 42.200 6.190 6 300.0
240 0.113 30.0 4.93 0.0 0.428 6.897 54.300 6.336 6 300.0
241 0.106 30.0 4.93 0.0 0.428 6.095 65.100 6.336 6 300.0
242 0.103 30.0 4.93 0.0 0.428 6.358 52.900 7.035 6 300.0
243 0.128 30.0 4.93 0.0 0.428 6.393 7.800 7.035 6 300.0
244 0.206 22.0 5.86 0.0 0.431 5.593 76.500 7.955 7 330.0
245 0.191 22.0 5.86 0.0 0.431 5.605 70.200 7.955 7 330.0
246 0.340 22.0 5.86 0.0 0.431 6.108 34.900 8.056 7 330.0
247 0.197 22.0 5.86 0.0 0.431 6.226 79.200 8.056 7 330.0
248 0.164 22.0 5.86 0.0 0.431 6.433 49.100 7.827 7 330.0
249 0.191 22.0 5.86 0.0 0.431 6.718 17.500 7.827 7 330.0
250 0.140 22.0 5.86 0.0 0.431 6.487 13.000 7.397 7 330.0
251 0.214 22.0 5.86 0.0 0.431 6.438 8.900 7.397 7 330.0
252 0.082 22.0 5.86 0.0 0.431 6.957 6.800 8.907 7 330.0
253 0.369 22.0 5.86 0.0 0.431 8.259 8.400 8.907 7 330.0
254 0.048 80.0 3.64 0.0 0.392 6.108 32.000 9.220 1 315.0
255 0.035 80.0 3.64 0.0 0.392 5.876 19.100 9.220 1 315.0
256 0.015 90.0 3.75 0.0 0.394 7.454 34.200 6.336 3 244.0
257 0.612 20.0 3.97 0.0 0.647 8.704 86.900 1.801 5 264.0
258 0.664 20.0 3.97 0.0 0.647 7.333 100.000 1.895 5 264.0
259 0.657 20.0 3.97 0.0 0.647 6.842 100.000 2.011 5 264.0
260 0.540 20.0 3.97 0.0 0.647 7.203 81.800 2.112 5 264.0
261 0.534 20.0 3.97 0.0 0.647 7.520 89.400 2.140 5 264.0
262 0.520 20.0 3.97 0.0 0.647 8.398 91.500 2.288 5 264.0
263 0.825 20.0 3.97 0.0 0.647 7.327 94.500 2.079 5 264.0
264 0.550 20.0 3.97 0.0 0.647 7.206 91.600 NaN 5 264.0
265 0.762 20.0 3.97 0.0 0.647 5.560 62.800 1.986 5 264.0
266 0.786 20.0 3.97 0.0 0.647 7.014 84.600 2.133 5 264.0
267 0.578 20.0 3.97 0.0 0.575 8.297 67.000 2.422 5 264.0
268 0.540 20.0 3.97 0.0 0.575 7.470 52.600 2.872 5 264.0
269 0.091 20.0 6.96 1.0 0.464 5.920 61.500 3.917 3 223.0
270 0.299 20.0 6.96 NaN 0.464 5.856 42.100 4.429 3 223.0
271 0.162 20.0 6.96 0.0 0.464 6.240 16.300 4.429 3 223.0
272 0.115 20.0 6.96 0.0 0.464 6.538 58.700 3.917 3 223.0
273 0.222 20.0 6.96 1.0 0.464 7.691 51.800 4.367 3 223.0
274 0.056 40.0 6.41 1.0 0.447 6.758 32.900 4.078 4 254.0
275 0.096 40.0 6.41 0.0 0.447 6.854 42.800 4.267 4 254.0
276 0.105 40.0 6.41 1.0 0.447 7.267 49.000 4.787 4 254.0
277 0.061 40.0 6.41 1.0 0.447 6.826 27.600 4.863 4 254.0
278 0.080 40.0 6.41 0.0 0.447 6.482 32.100 4.140 4 254.0
279 0.210 20.0 3.33 0.0 0.443 6.812 32.200 NaN 5 216.0
280 0.036 20.0 3.33 0.0 0.443 7.820 64.500 NaN 5 216.0
281 0.037 20.0 3.33 0.0 0.443 6.968 37.200 NaN 5 216.0
282 0.061 20.0 3.33 1.0 0.443 7.645 49.700 NaN 5 216.0
283 0.015 90.0 1.21 1.0 0.401 7.923 24.800 NaN 1 198.0
284 0.009 90.0 2.97 0.0 0.400 7.088 20.800 NaN 1 285.0
285 0.011 55.0 2.25 0.0 0.389 6.453 31.900 7.307 1 300.0
286 0.020 80.0 1.76 NaN 0.385 6.230 31.500 9.089 1 241.0
287 0.039 52.5 5.32 0.0 0.405 6.209 31.300 7.317 6 293.0
288 0.046 52.5 5.32 0.0 0.405 6.315 45.600 7.317 6 293.0
289 0.043 52.5 5.32 0.0 0.405 6.565 22.900 7.317 6 293.0
290 0.035 80.0 4.95 0.0 0.411 6.861 27.900 5.117 4 245.0
291 0.079 80.0 4.95 0.0 0.411 7.148 27.700 5.117 4 245.0
292 0.036 80.0 4.95 0.0 0.411 6.630 23.400 5.117 4 245.0
293 0.083 0.0 13.92 0.0 0.437 6.127 18.400 5.503 4 289.0
294 0.082 0.0 13.92 0.0 0.437 6.009 42.300 5.503 4 289.0
295 0.129 0.0 13.92 0.0 0.437 6.678 31.100 5.960 4 289.0
296 0.054 0.0 13.92 0.0 0.437 6.549 51.000 5.960 4 289.0
297 0.141 0.0 13.92 NaN 0.437 5.790 58.000 6.320 4 289.0
298 0.065 70.0 2.24 0.0 0.400 6.345 20.100 7.828 5 358.0
299 0.056 70.0 2.24 0.0 0.400 7.041 10.000 7.828 5 358.0
300 0.044 70.0 2.24 NaN 0.400 6.871 47.400 7.828 5 358.0
301 0.035 34.0 6.09 NaN 0.433 6.590 40.400 5.492 7 329.0
302 0.093 34.0 6.09 0.0 0.433 6.495 18.400 5.492 7 329.0
303 0.100 34.0 6.09 0.0 0.433 6.982 17.700 5.492 7 329.0
304 0.055 33.0 2.18 0.0 0.472 7.236 41.100 4.022 7 222.0
305 0.055 33.0 2.18 0.0 0.472 6.616 58.100 3.370 7 222.0
306 0.075 33.0 2.18 0.0 0.472 7.420 71.900 3.099 7 222.0
307 0.049 33.0 2.18 0.0 0.472 6.849 70.300 3.183 7 222.0
308 0.493 0.0 9.90 0.0 0.544 6.635 82.500 3.317 4 304.0
309 0.349 0.0 9.90 0.0 0.544 5.972 76.700 3.103 4 304.0
310 2.635 0.0 9.90 0.0 0.544 4.973 37.800 NaN 4 304.0
311 0.790 0.0 9.90 0.0 0.544 6.122 52.800 2.640 4 304.0
312 0.262 0.0 9.90 0.0 0.544 6.023 90.400 2.834 4 304.0
313 0.269 0.0 9.90 0.0 0.544 6.266 82.800 3.263 4 304.0
314 0.369 0.0 9.90 0.0 0.544 6.567 87.300 3.602 4 304.0
315 0.254 0.0 9.90 0.0 0.544 5.705 77.700 3.945 4 304.0
316 0.318 0.0 9.90 0.0 0.544 5.914 83.200 3.999 4 304.0
317 0.245 0.0 9.90 0.0 0.544 5.782 71.700 4.032 4 304.0
318 0.402 0.0 9.90 0.0 0.544 6.382 67.200 3.533 4 304.0
319 0.475 0.0 9.90 0.0 0.544 6.113 58.800 4.002 4 304.0
320 0.168 0.0 7.38 0.0 0.493 6.426 52.300 4.540 5 287.0
321 0.182 0.0 7.38 0.0 0.493 6.376 54.300 4.540 5 287.0
322 0.351 0.0 7.38 0.0 0.493 6.041 49.900 4.721 5 287.0
323 0.284 0.0 7.38 0.0 0.493 5.708 74.300 4.721 5 287.0
324 0.341 0.0 7.38 0.0 0.493 6.415 40.100 4.721 5 287.0
325 0.192 0.0 7.38 0.0 0.493 6.431 14.700 NaN 5 287.0
326 0.303 0.0 7.38 0.0 0.493 6.312 28.900 5.416 5 287.0
327 0.241 0.0 7.38 0.0 0.493 6.083 43.700 5.416 5 287.0
328 0.066 0.0 3.24 0.0 0.460 5.868 25.800 5.215 4 430.0
329 0.067 0.0 3.24 0.0 0.460 6.333 17.200 5.215 4 430.0
330 0.045 0.0 3.24 0.0 0.460 6.144 32.200 5.874 4 430.0
331 0.050 35.0 6.06 0.0 0.438 5.706 28.400 6.641 1 304.0
332 0.035 35.0 6.06 0.0 0.438 6.031 23.300 6.641 1 304.0
333 0.051 0.0 5.19 0.0 0.515 6.316 38.100 6.458 5 224.0
334 0.037 0.0 5.19 0.0 0.515 6.310 38.500 6.458 5 224.0
335 0.040 0.0 5.19 0.0 0.515 6.037 34.500 5.985 5 224.0
336 0.034 0.0 5.19 0.0 0.515 5.869 46.300 5.231 5 224.0
337 0.030 0.0 5.19 0.0 0.515 5.895 59.600 5.615 5 224.0
338 0.033 0.0 5.19 0.0 0.515 6.059 37.300 4.812 5 224.0
339 0.055 0.0 5.19 0.0 0.515 5.985 45.400 4.812 5 224.0
340 0.062 0.0 5.19 0.0 0.515 5.968 58.500 NaN 5 224.0
341 0.013 35.0 1.52 0.0 0.442 7.241 49.300 7.038 1 284.0
342 0.025 0.0 1.89 0.0 0.518 6.540 59.700 6.267 1 422.0
343 0.025 55.0 3.78 0.0 0.484 6.696 56.400 5.732 5 370.0
344 0.030 55.0 3.78 0.0 0.484 6.874 28.100 6.465 5 370.0
345 0.031 0.0 4.39 0.0 0.442 6.014 48.500 8.014 3 352.0
346 0.062 0.0 4.39 0.0 0.442 5.898 52.300 8.014 3 352.0
347 0.019 85.0 4.15 0.0 0.429 6.516 27.700 8.535 4 351.0
348 0.015 80.0 2.01 0.0 0.435 6.635 29.700 8.344 4 280.0
349 0.029 40.0 1.25 0.0 0.429 6.939 34.500 8.792 1 335.0
350 0.062 40.0 1.25 0.0 0.429 6.490 44.400 8.792 1 335.0
351 0.080 60.0 1.69 0.0 0.411 6.579 35.900 10.710 4 411.0
352 0.072 60.0 1.69 0.0 0.411 5.884 18.500 10.710 4 411.0
353 0.017 90.0 2.02 0.0 0.410 6.728 36.100 12.127 5 187.0
354 0.043 80.0 1.91 0.0 0.413 5.663 21.900 10.586 4 334.0
355 0.107 80.0 1.91 0.0 0.413 5.936 19.500 10.586 4 334.0
356 8.983 0.0 18.10 1.0 0.770 6.212 97.400 2.122 24 666.0
357 3.850 0.0 18.10 1.0 0.770 6.395 91.000 2.505 24 666.0
358 5.202 0.0 18.10 1.0 0.770 6.127 83.400 2.723 24 666.0
359 4.261 0.0 18.10 0.0 0.770 6.112 81.300 2.509 24 666.0
360 4.542 0.0 18.10 0.0 0.770 6.398 88.000 NaN 24 666.0
361 3.837 0.0 18.10 0.0 0.770 6.251 91.100 2.296 24 666.0
362 3.678 0.0 18.10 0.0 0.770 5.362 96.200 2.104 24 666.0
363 4.222 0.0 18.10 1.0 0.770 5.803 89.000 1.905 24 666.0
364 3.474 0.0 18.10 NaN 0.718 8.780 82.900 1.905 24 666.0
365 4.556 0.0 18.10 0.0 0.718 3.561 87.900 1.613 24 666.0
366 3.697 0.0 18.10 0.0 0.718 4.963 91.400 1.752 24 666.0
367 0.000 18.1 0.00 1.0 3.863 100.000 1.511 24.000 666 20.2
368 4.898 0.0 18.10 0.0 0.631 4.970 100.000 1.333 24 666.0
369 5.670 0.0 18.10 1.0 0.631 6.683 96.800 1.357 24 666.0
370 6.539 0.0 18.10 1.0 0.631 7.016 97.500 1.202 24 666.0
371 9.232 0.0 18.10 0.0 0.631 6.216 100.000 1.169 24 666.0
372 8.267 0.0 18.10 1.0 0.668 5.875 89.600 1.130 24 666.0
373 0.000 18.1 0.00 1.0 4.906 100.000 1.174 24.000 666 20.2
374 0.000 18.1 0.00 1.0 4.138 100.000 1.137 24.000 666 20.2
375 0.000 18.1 0.00 1.0 7.313 97.900 1.316 24.000 666 20.2
376 0.000 18.1 0.00 1.0 6.649 93.300 1.345 24.000 666 20.2
377 9.823 0.0 18.10 0.0 0.671 6.794 98.800 1.358 24 666.0
378 0.000 18.1 0.00 1.0 6.380 96.200 1.386 24.000 666 20.2
379 0.000 18.1 0.00 1.0 6.223 100.000 1.386 24.000 666 20.2
380 0.000 18.1 0.00 1.0 6.968 91.900 1.417 NaN 666 20.2
381 0.000 18.1 0.00 1.0 6.545 99.100 1.519 24.000 666 20.2
382 9.187 0.0 18.10 0.0 0.700 5.536 100.000 1.580 24 666.0
383 7.992 0.0 18.10 0.0 0.700 5.520 100.000 1.533 24 666.0
384 0.000 18.1 0.00 1.0 4.368 91.200 1.440 24.000 666 20.2
385 0.000 18.1 0.00 1.0 5.277 98.100 1.426 24.000 666 20.2
386 0.000 18.1 0.00 1.0 4.652 100.000 1.467 24.000 666 20.2
387 0.000 18.1 0.00 1.0 5.000 89.500 1.518 24.000 666 20.2
388 0.000 18.1 0.00 1.0 4.880 100.000 1.589 24.000 666 20.2
389 8.152 0.0 18.10 0.0 0.700 5.390 98.900 1.728 24 666.0
390 6.962 0.0 18.10 0.0 0.700 5.713 97.000 1.927 24 666.0
391 5.293 0.0 18.10 0.0 0.700 6.051 82.500 2.168 24 666.0
392 0.000 18.1 0.00 1.0 5.036 97.000 1.770 24.000 666 20.2
393 8.645 0.0 18.10 0.0 0.693 6.193 92.600 1.791 24 666.0
394 0.000 18.1 0.00 1.0 5.887 94.700 1.782 24.000 666 20.2
395 8.717 0.0 18.10 0.0 0.693 6.471 98.800 1.726 24 666.0
396 5.872 0.0 18.10 0.0 0.693 6.405 96.000 1.677 24 666.0
397 7.672 0.0 18.10 0.0 0.693 5.747 98.900 1.633 24 666.0
398 0.000 18.1 0.00 1.0 5.453 100.000 1.490 24.000 666 20.2
399 9.917 0.0 18.10 0.0 0.693 5.852 77.800 1.500 24 666.0
400 0.000 18.1 0.00 1.0 5.987 100.000 1.589 NaN 666 20.2
401 0.000 18.1 0.00 1.0 6.343 100.000 1.574 24.000 666 20.2
402 9.596 0.0 18.10 0.0 0.693 6.404 100.000 1.639 24 666.0
403 0.000 18.1 0.00 NaN 5.349 96.000 1.703 24.000 666 20.2
404 0.000 18.1 0.00 1.0 5.531 85.400 1.607 24.000 666 20.2
405 0.000 18.1 0.00 1.0 5.683 100.000 1.425 24.000 666 20.2
406 0.000 18.1 0.00 1.0 4.138 100.000 1.178 24.000 666 20.2
407 0.000 18.1 0.00 1.0 5.608 100.000 1.285 24.000 666 20.2
408 7.404 0.0 18.10 0.0 0.597 5.617 97.900 1.455 24 666.0
409 0.000 18.1 0.00 1.0 6.852 100.000 1.466 24.000 666 20.2
410 0.000 18.1 0.00 1.0 5.757 100.000 1.413 24.000 666 20.2
411 0.000 18.1 0.00 1.0 6.657 100.000 1.528 24.000 666 20.2
412 0.000 18.1 0.00 1.0 4.628 100.000 1.554 24.000 666 20.2
413 0.000 18.1 0.00 1.0 5.155 100.000 1.589 24.000 666 20.2
414 0.000 18.1 0.00 1.0 4.519 100.000 1.658 24.000 666 20.2
415 0.000 18.1 0.00 1.0 6.434 100.000 1.835 24.000 666 20.2
416 0.000 18.1 0.00 1.0 6.782 90.800 1.819 24.000 666 20.2
417 0.000 18.1 0.00 1.0 5.304 89.100 1.647 24.000 666 20.2
418 0.000 18.1 0.00 1.0 5.957 100.000 1.803 24.000 666 20.2
419 0.000 18.1 0.00 1.0 6.824 76.500 1.794 24.000 666 20.2
420 0.000 18.1 0.00 1.0 6.411 100.000 1.859 24.000 666 20.2
421 7.023 0.0 18.10 0.0 0.718 6.006 95.300 1.875 24 666.0
422 0.000 18.1 0.00 1.0 5.648 87.600 1.951 24.000 666 20.2
423 7.050 0.0 18.10 0.0 0.614 6.103 85.100 2.022 24 666.0
424 8.792 0.0 18.10 0.0 0.584 5.565 70.600 2.063 24 666.0
425 0.000 18.1 0.00 1.0 5.896 95.400 1.910 24.000 666 20.2
426 0.000 18.1 0.00 1.0 5.837 59.700 1.998 24.000 666 20.2
427 0.000 18.1 0.00 1.0 6.202 78.700 1.863 24.000 666 20.2
428 7.367 0.0 18.10 0.0 0.679 6.193 78.100 NaN 24 666.0
429 9.339 0.0 18.10 0.0 0.679 6.380 95.600 1.968 24 666.0
430 8.492 0.0 18.10 0.0 0.584 6.348 86.100 2.053 24 666.0
431 0.000 18.1 0.00 1.0 6.833 94.300 2.088 24.000 666 20.2
432 6.444 0.0 18.10 0.0 0.584 6.425 74.800 2.200 24 666.0
433 5.581 0.0 18.10 0.0 0.713 6.436 87.900 2.316 24 666.0
434 0.000 18.1 0.00 1.0 6.208 95.000 2.222 24.000 666 20.2
435 0.000 18.1 0.00 1.0 6.629 94.600 2.125 24.000 666 20.2
436 0.000 18.1 0.00 1.0 6.461 93.300 2.003 24.000 666 20.2
437 0.000 18.1 0.00 1.0 6.152 100.000 1.914 24.000 666 20.2
438 0.000 18.1 0.00 1.0 5.935 87.900 1.821 24.000 666 20.2
439 9.391 0.0 18.10 0.0 0.740 5.627 93.900 1.817 24 666.0
440 0.000 18.1 0.00 NaN 5.818 92.400 1.866 24.000 666 20.2
441 9.724 0.0 18.10 0.0 0.740 6.406 97.200 2.065 24 666.0
442 5.666 0.0 18.10 0.0 0.740 6.219 100.000 2.005 24 666.0
443 9.967 0.0 18.10 0.0 0.740 6.485 100.000 1.978 24 666.0
444 0.000 18.1 0.00 1.0 5.854 96.600 1.896 24.000 666 20.2
445 0.000 18.1 0.00 1.0 6.459 94.800 1.988 24.000 666 20.2
446 6.288 0.0 18.10 0.0 0.740 6.341 96.400 2.072 24 666.0
447 9.925 0.0 18.10 0.0 0.740 6.251 96.600 2.198 24 666.0
448 9.329 0.0 18.10 0.0 0.713 6.185 98.700 2.262 24 666.0
449 7.526 0.0 18.10 0.0 0.713 6.417 98.300 2.185 24 666.0
450 6.718 0.0 18.10 0.0 0.713 6.749 92.600 2.324 24 666.0
451 5.441 0.0 18.10 0.0 0.713 6.655 98.200 2.355 24 666.0
452 5.090 0.0 18.10 0.0 0.713 6.297 91.800 2.368 24 666.0
453 8.248 0.0 18.10 0.0 0.713 7.393 99.300 2.453 24 666.0
454 9.514 0.0 18.10 0.0 0.713 6.728 94.100 2.496 24 666.0
455 4.752 0.0 18.10 0.0 0.713 6.525 86.500 2.436 24 666.0
456 4.669 0.0 18.10 0.0 0.713 5.976 87.900 2.581 24 666.0
457 8.201 0.0 18.10 0.0 0.713 5.936 80.300 2.779 24 666.0
458 7.752 0.0 18.10 0.0 0.713 6.301 83.700 NaN 24 666.0
459 6.801 0.0 18.10 0.0 0.713 6.081 84.400 2.717 24 666.0
460 4.812 0.0 18.10 0.0 0.713 6.701 90.000 2.598 24 666.0
461 3.693 0.0 18.10 0.0 0.713 6.376 88.400 2.567 24 666.0
462 6.655 0.0 18.10 0.0 0.713 6.317 83.000 2.734 24 666.0
463 5.821 0.0 18.10 0.0 0.713 6.513 89.900 2.802 24 666.0
464 7.839 0.0 18.10 0.0 0.655 6.209 65.400 2.963 24 666.0
465 3.164 0.0 18.10 0.0 0.655 5.759 48.200 3.067 24 666.0
466 3.775 0.0 18.10 0.0 0.655 5.952 84.700 2.872 24 666.0
467 4.422 0.0 18.10 0.0 0.584 6.003 94.500 2.540 24 666.0
468 0.000 18.1 0.00 1.0 5.926 71.000 2.908 24.000 666 20.2
469 0.000 18.1 0.00 1.0 5.713 56.700 2.824 24.000 666 20.2
470 4.349 0.0 18.10 0.0 0.580 6.167 84.000 3.033 24 666.0
471 4.038 0.0 18.10 0.0 0.532 6.229 90.700 3.099 24 666.0
472 3.569 0.0 18.10 0.0 0.580 6.437 75.000 2.897 24 666.0
473 4.647 0.0 18.10 0.0 0.614 6.980 67.600 2.533 24 666.0
474 8.056 0.0 18.10 0.0 0.584 5.427 95.400 2.430 24 666.0
475 6.393 0.0 18.10 0.0 0.584 6.162 97.400 2.206 24 666.0
476 4.871 0.0 18.10 0.0 0.614 6.484 93.600 2.305 24 666.0
477 0.000 18.1 0.00 1.0 5.304 97.300 2.101 24.000 666 20.2
478 0.000 18.1 0.00 1.0 6.185 96.700 2.171 24.000 666 20.2
479 0.000 18.1 0.00 1.0 6.229 88.000 1.951 24.000 666 20.2
480 5.824 0.0 18.10 0.0 0.532 6.242 64.700 3.424 24 666.0
481 5.708 0.0 18.10 0.0 0.532 6.750 74.900 3.332 24 666.0
482 5.731 0.0 18.10 0.0 0.532 7.061 77.000 3.411 24 666.0
483 2.818 0.0 18.10 0.0 0.532 5.762 40.300 NaN 24 666.0
484 2.379 0.0 18.10 NaN 0.583 5.871 41.900 3.724 24 666.0
485 3.674 0.0 18.10 NaN 0.583 6.312 51.900 3.992 24 666.0
486 5.692 0.0 18.10 NaN 0.583 6.114 79.800 3.546 24 666.0
487 4.836 0.0 18.10 NaN 0.583 5.905 53.200 3.152 24 666.0
488 0.151 0.0 27.74 NaN 0.609 5.454 92.700 1.821 4 711.0
489 0.183 0.0 27.74 NaN 0.609 5.414 98.300 1.755 4 711.0
490 0.207 0.0 27.74 NaN 0.609 5.093 98.000 1.823 4 711.0
491 0.106 0.0 27.74 NaN 0.609 5.983 98.800 1.868 4 711.0
492 0.111 0.0 27.74 0.0 0.609 5.983 83.500 2.110 4 711.0
493 0.173 0.0 9.69 0.0 0.585 5.707 54.000 2.382 6 391.0
494 0.280 0.0 9.69 0.0 0.585 5.926 42.600 2.382 6 391.0
495 0.179 0.0 9.69 0.0 0.585 5.670 28.800 2.799 6 391.0
496 0.290 0.0 9.69 0.0 0.585 5.390 72.900 2.799 6 391.0
497 0.268 0.0 9.69 0.0 0.585 5.794 70.600 2.893 6 391.0
498 0.239 0.0 9.69 0.0 0.585 6.019 65.300 2.409 6 391.0
499 0.178 0.0 9.69 0.0 0.585 5.569 73.500 NaN 6 391.0
500 0.224 0.0 9.69 0.0 0.585 6.027 79.700 2.498 6 391.0
501 0.063 0.0 11.93 0.0 0.573 6.593 69.100 2.479 1 273.0
502 0.045 0.0 11.93 0.0 0.573 6.120 76.700 2.288 1 273.0
503 0.061 0.0 11.93 0.0 0.573 6.976 91.000 2.167 1 273.0
504 0.110 0.0 11.93 0.0 0.573 6.794 89.300 2.389 1 273.0
505 0.047 0.0 11.93 0.0 0.573 6.030 80.800 2.505 1 273.0
PTRATIO B LSTAT MEDV
0 15.30 396.90 4.98 24.0
1 17.80 396.90 9.14 21.6
2 17.80 392.83 4.03 34.7
3 18.70 NaN 2.94 33.4
4 18.70 396.90 5.33 36.2
5 18.70 394.12 5.21 28.7
6 15.20 395.60 12.43 22.9
7 15.20 396.90 19.15 27.1
8 15.20 386.63 29.93 16.5
9 15.20 386.71 17.10 18.9
10 15.20 392.52 20.45 15.0
11 15.20 396.90 13.27 18.9
12 15.20 390.50 15.71 21.7
13 21.00 396.90 8.26 20.4
14 21.00 380.02 10.26 18.2
15 21.00 395.62 8.47 19.9
16 21.00 386.85 6.58 23.1
17 21.00 386.75 14.67 17.5
18 21.00 288.99 11.69 20.2
19 21.00 390.95 11.28 18.2
20 21.00 NaN 21.02 13.6
21 21.00 392.53 13.83 19.6
22 21.00 396.90 18.72 15.2
23 21.00 394.54 19.88 14.5
24 21.00 394.33 16.30 15.6
25 21.00 303.42 16.51 13.9
26 21.00 376.88 14.81 16.6
27 21.00 306.38 17.28 14.8
28 21.00 387.94 12.80 18.4
29 21.00 380.23 11.98 21.0
30 21.00 360.17 22.60 12.7
31 21.00 376.73 13.04 14.5
32 21.00 232.60 27.71 13.2
33 21.00 358.77 18.35 13.1
34 21.00 248.31 20.34 13.5
35 19.20 396.90 9.68 18.9
36 19.20 377.56 11.41 20.0
37 19.20 NaN 8.77 21.0
38 19.20 393.43 10.13 24.7
39 18.30 395.63 4.32 30.8
40 18.30 395.62 1.98 34.9
41 17.90 385.41 4.84 26.6
42 17.90 383.37 5.81 25.3
43 17.90 394.46 7.44 24.7
44 17.90 389.39 9.55 21.2
45 17.90 396.90 10.21 19.3
46 17.90 396.90 14.15 20.0
47 17.90 392.74 18.80 16.6
48 17.90 396.90 30.81 14.4
49 17.90 396.90 16.20 19.4
50 16.80 NaN 13.45 19.7
51 16.80 393.97 9.43 20.5
52 16.80 396.90 5.28 25.0
53 16.80 396.90 8.43 23.4
54 21.10 396.90 14.80 18.9
55 17.90 395.93 4.81 35.4
56 17.30 396.90 5.77 24.7
57 15.10 392.90 3.95 31.6
58 19.70 390.68 6.86 23.3
59 19.70 396.90 9.22 19.6
60 19.70 395.11 13.15 18.7
61 19.70 378.08 14.44 16.0
62 19.70 396.90 6.73 22.2
63 19.70 395.58 9.50 25.0
64 18.60 393.24 8.05 33.0
65 16.10 396.90 4.67 23.5
66 16.10 396.90 10.24 19.4
67 18.90 396.21 8.10 22.0
68 18.90 396.90 13.09 17.4
69 18.90 396.90 8.79 20.9
70 19.20 383.73 6.72 24.2
71 19.20 376.94 9.88 21.7
72 19.20 390.91 5.52 22.8
73 19.20 377.17 7.54 23.4
74 18.70 394.92 6.78 24.1
75 18.70 383.23 8.94 21.4
76 18.70 373.66 11.97 20.0
77 18.70 386.96 10.27 20.8
78 18.70 386.40 12.34 21.2
79 18.70 396.06 9.10 20.3
80 19.00 396.90 5.29 28.0
81 19.00 395.63 7.22 23.9
82 19.00 396.90 6.72 24.8
83 19.00 390.64 7.51 22.9
84 18.50 396.90 9.62 23.9
85 18.50 392.30 6.53 26.6
86 18.50 395.99 12.86 22.5
87 18.50 395.15 8.44 22.2
88 17.80 396.90 5.50 23.6
89 17.80 396.06 5.70 28.7
90 17.80 392.18 8.81 22.6
91 17.80 393.55 8.20 22.0
92 18.20 395.01 8.16 22.9
93 18.20 396.33 6.21 25.0
94 18.20 396.90 10.59 20.6
95 18.00 357.98 6.65 28.4
96 18.00 391.83 11.34 21.4
97 18.00 396.90 4.21 38.7
98 18.00 393.53 3.57 43.8
99 18.00 396.90 6.19 33.2
100 20.90 394.76 9.42 27.5
101 20.90 395.58 7.67 26.5
102 20.90 70.80 10.63 18.6
103 20.90 394.47 13.44 19.3
104 20.90 392.69 12.33 20.1
105 20.90 394.05 16.47 19.5
106 20.90 395.67 18.66 19.5
107 20.90 387.69 14.09 20.4
108 20.90 395.24 12.27 19.8
109 20.90 391.23 15.55 19.4
110 20.90 393.49 13.00 21.7
111 17.80 395.59 10.16 22.8
112 17.80 394.95 16.21 18.8
113 17.80 396.90 17.09 18.7
114 17.80 388.74 10.45 18.5
115 17.80 NaN 15.76 18.3
116 17.80 393.30 12.04 21.2
117 17.80 394.51 10.30 19.2
118 17.80 338.63 15.37 20.4
119 17.80 391.50 13.61 19.3
120 19.10 389.15 14.37 22.0
121 19.10 377.67 14.27 20.3
122 19.10 378.09 17.93 20.5
123 19.10 370.31 25.41 17.3
124 19.10 379.38 17.58 18.8
125 19.10 385.02 14.81 21.4
126 19.10 359.29 27.26 15.7
127 21.20 392.11 17.19 16.2
128 21.20 396.90 15.39 18.0
129 21.20 396.90 18.34 14.3
130 21.20 395.04 12.60 19.2
131 21.20 396.90 12.26 19.6
132 21.20 385.76 11.12 23.0
133 21.20 388.69 15.03 18.4
134 21.20 262.76 17.31 15.6
135 21.20 394.67 16.96 18.1
136 21.20 378.25 16.90 17.4
137 21.20 394.08 14.59 17.1
138 21.20 392.04 21.32 13.3
139 21.20 396.90 18.46 17.8
140 21.20 NaN 24.16 14.0
141 21.20 396.90 34.41 14.4
142 14.70 NaN 26.82 13.4
143 14.70 396.90 26.42 15.6
144 14.70 396.90 29.29 11.8
145 14.70 172.91 27.80 13.8
146 14.70 169.27 16.65 15.6
147 14.70 391.71 29.53 14.6
148 14.70 356.99 28.32 17.8
149 14.70 351.85 21.45 15.4
150 14.70 372.80 14.10 21.5
151 14.70 341.60 13.28 19.6
152 14.70 343.28 12.12 15.3
153 14.70 261.95 15.79 19.4
154 14.70 321.02 15.12 17.0
155 14.70 88.01 15.02 15.6
156 14.70 88.63 16.14 13.1
157 14.70 363.43 4.59 41.3
158 14.70 353.89 6.43 24.3
159 14.70 364.31 7.39 23.3
160 14.70 338.92 5.50 27.0
161 14.70 374.43 1.73 50.0
162 14.70 389.61 1.92 50.0
163 14.70 388.45 3.32 50.0
164 14.70 395.11 11.64 22.7
165 14.70 240.16 9.81 25.0
166 14.70 369.30 3.70 50.0
167 14.70 227.61 12.14 23.8
168 14.70 297.09 11.10 23.8
169 14.70 330.04 11.32 22.3
170 14.70 NaN 14.43 17.4
171 14.70 348.13 12.03 19.1
172 16.60 396.90 14.69 23.1
173 16.60 395.50 9.04 23.6
174 16.60 393.23 9.64 22.6
175 16.60 390.96 5.33 29.4
176 16.60 393.23 10.11 23.2
177 16.60 395.60 6.29 24.6
178 16.60 391.27 6.92 29.9
179 17.80 396.90 5.04 37.2
180 17.80 395.56 7.56 39.8
181 17.80 396.90 9.45 36.2
182 17.80 394.12 4.82 37.9
183 17.80 396.90 5.68 32.5
184 17.80 391.00 13.98 26.4
185 17.80 387.11 13.15 29.6
186 17.80 392.63 4.45 50.0
187 15.20 393.87 6.68 32.0
188 15.20 382.84 4.56 29.8
189 15.20 396.90 5.39 34.9
190 15.20 377.68 5.10 37.0
191 15.20 389.71 4.69 30.5
192 15.20 390.49 2.87 36.4
193 15.60 393.37 5.03 31.1
194 15.60 376.70 4.38 29.1
195 14.40 394.23 2.97 50.0
196 12.60 396.90 4.08 33.3
197 12.60 354.31 8.61 30.3
198 12.60 392.20 6.62 34.6
199 17.00 396.90 4.56 34.9
200 17.00 384.30 4.45 32.9
201 14.70 393.77 7.43 24.1
202 14.70 395.38 3.11 42.3
203 14.70 392.78 3.81 48.5
204 14.70 390.55 2.88 50.0
205 18.60 396.90 10.87 22.6
206 18.60 394.87 10.97 24.4
207 18.60 389.43 18.06 22.5
208 18.60 381.32 14.66 24.4
209 18.60 396.90 23.09 20.0
210 18.60 393.25 17.27 21.7
211 18.60 395.24 23.98 19.3
212 18.60 390.94 16.03 22.4
213 18.60 385.81 9.38 28.1
214 18.60 348.93 29.55 23.7
215 18.60 393.63 9.47 25.0
216 16.40 392.80 13.51 23.3
217 16.40 392.78 9.69 28.7
218 16.40 396.90 17.92 21.5
219 16.40 393.74 10.50 23.0
220 17.40 NaN 9.71 26.7
221 17.40 395.24 21.46 21.7
222 17.40 390.39 9.93 27.5
223 17.40 396.90 7.60 30.1
224 17.40 385.05 4.14 44.8
225 17.40 NaN 4.63 50.0
226 17.40 387.38 3.13 37.6
227 17.40 372.08 6.36 31.6
228 17.40 377.51 3.92 46.7
229 17.40 380.34 3.76 31.5
230 17.40 378.35 11.65 24.3
231 17.40 376.14 5.25 31.7
232 17.40 385.91 2.47 41.7
233 17.40 NaN 3.95 48.3
234 17.40 360.20 8.05 29.0
235 17.40 NaN 10.88 24.0
236 17.40 388.45 9.54 25.1
237 17.40 390.07 4.73 31.5
238 16.60 379.41 6.36 23.7
239 16.60 383.78 7.37 23.3
240 16.60 391.25 11.38 22.0
241 16.60 394.62 12.40 20.1
242 16.60 372.75 11.22 22.2
243 16.60 374.71 5.19 23.7
244 19.10 372.49 12.50 17.6
245 19.10 389.13 18.46 18.5
246 19.10 390.18 9.16 24.3
247 19.10 376.14 10.15 20.5
248 19.10 374.71 9.52 24.5
249 19.10 393.74 6.56 26.2
250 19.10 396.28 5.90 24.4
251 19.10 377.07 3.59 24.8
252 19.10 386.09 3.53 29.6
253 19.10 396.90 3.54 42.8
254 16.40 392.89 6.57 21.9
255 16.40 395.18 9.25 20.9
256 15.90 386.34 3.11 44.0
257 13.00 389.70 5.12 50.0
258 13.00 383.29 7.79 36.0
259 13.00 391.93 6.90 30.1
260 13.00 392.80 9.59 33.8
261 13.00 388.37 7.26 43.1
262 13.00 386.86 5.91 48.8
263 13.00 393.42 11.25 31.0
264 13.00 387.89 8.10 36.5
265 13.00 392.40 10.45 22.8
266 13.00 384.07 14.79 30.7
267 13.00 384.54 7.44 50.0
268 13.00 390.30 3.16 43.5
269 18.60 391.34 13.65 20.7
270 18.60 388.65 13.00 21.1
271 18.60 396.90 6.59 25.2
272 18.60 394.96 7.73 24.4
273 18.60 390.77 6.58 35.2
274 17.60 396.90 3.53 32.4
275 17.60 396.90 2.98 32.0
276 17.60 389.25 6.05 33.2
277 17.60 393.45 4.16 33.1
278 17.60 396.90 7.19 29.1
279 14.90 396.90 4.85 35.1
280 14.90 387.31 3.76 45.4
281 14.90 392.23 4.59 35.4
282 14.90 377.07 3.01 46.0
283 13.60 395.52 3.16 50.0
284 15.30 394.72 7.85 32.2
285 15.30 394.72 8.23 22.0
286 18.20 341.60 12.93 20.1
287 16.60 396.90 7.14 23.2
288 16.60 396.90 7.60 22.3
289 16.60 371.72 9.51 24.8
290 19.20 396.90 3.33 28.5
291 19.20 396.90 3.56 37.3
292 19.20 396.90 4.70 27.9
293 16.00 396.90 8.58 23.9
294 16.00 396.90 10.40 21.7
295 16.00 396.90 6.27 28.6
296 16.00 392.85 7.39 27.1
297 16.00 396.90 15.84 20.3
298 14.80 368.24 4.97 22.5
299 14.80 371.58 4.74 29.0
300 14.80 390.86 6.07 24.8
301 16.10 395.75 9.50 22.0
302 16.10 383.61 8.67 26.4
303 16.10 390.43 4.86 33.1
304 18.40 393.68 6.93 36.1
305 18.40 393.36 8.93 28.4
306 18.40 396.90 6.47 33.4
307 18.40 396.90 7.53 28.2
308 18.40 396.90 4.54 22.8
309 18.40 396.24 9.97 20.3
310 18.40 350.45 12.64 16.1
311 18.40 396.90 5.98 22.1
312 18.40 396.30 11.72 19.4
313 18.40 393.39 7.90 21.6
314 18.40 395.69 9.28 23.8
315 18.40 396.42 11.50 16.2
316 18.40 390.70 18.33 17.8
317 18.40 396.90 15.94 19.8
318 18.40 395.21 10.36 23.1
319 18.40 396.23 12.73 21.0
320 19.60 396.90 7.20 23.8
321 19.60 396.90 6.87 23.1
322 19.60 396.90 7.70 20.4
323 19.60 391.13 11.74 18.5
324 19.60 396.90 6.12 25.0
325 19.60 393.68 5.08 24.6
326 19.60 396.90 6.15 23.0
327 19.60 396.90 12.79 22.2
328 16.90 382.44 9.97 19.3
329 16.90 375.21 7.34 22.6
330 16.90 368.57 9.09 19.8
331 16.90 394.02 12.43 17.1
332 16.90 362.25 7.83 19.4
333 20.20 389.71 5.68 22.2
334 20.20 389.40 6.75 20.7
335 20.20 396.90 8.01 21.1
336 20.20 396.90 9.80 19.5
337 20.20 394.81 10.56 18.5
338 20.20 396.14 8.51 20.6
339 20.20 396.90 9.74 19.0
340 20.20 396.90 9.29 18.7
341 15.50 394.74 5.49 32.7
342 15.90 389.96 8.65 16.5
343 17.60 396.90 7.18 23.9
344 17.60 387.97 4.61 31.2
345 18.80 385.64 10.53 17.5
346 18.80 364.61 12.67 17.2
347 17.90 392.43 6.36 23.1
348 17.00 390.94 5.99 24.5
349 19.70 389.85 5.89 26.6
350 19.70 396.90 5.98 22.9
351 18.30 370.78 5.49 24.1
352 18.30 392.33 7.79 18.6
353 17.00 384.46 4.50 30.1
354 22.00 382.80 8.05 18.2
355 22.00 376.04 5.57 20.6
356 20.20 377.73 17.60 17.8
357 20.20 391.34 13.27 21.7
358 20.20 395.43 11.48 22.7
359 20.20 390.74 12.67 22.6
360 20.20 374.56 7.79 25.0
361 20.20 350.65 14.19 19.9
362 20.20 380.79 10.19 20.8
363 20.20 353.04 14.64 16.8
364 20.20 354.55 5.29 21.9
365 20.20 354.70 7.12 27.5
366 20.20 316.03 14.00 21.9
367 131.42 NaN 23.10 NaN
368 20.20 375.52 3.26 50.0
369 20.20 375.33 3.73 50.0
370 20.20 392.05 2.96 50.0
371 20.20 366.15 9.53 50.0
372 20.20 347.88 8.88 50.0
373 396.90 34.77 13.80 NaN
374 396.90 NaN 13.80 NaN
375 396.90 NaN 15.00 NaN
376 363.02 NaN 13.90 NaN
377 20.20 396.90 21.24 13.3
378 396.90 23.69 13.10 NaN
379 393.74 21.78 10.20 NaN
380 396.90 17.21 10.40 NaN
381 396.90 21.08 10.90 NaN
382 20.20 396.90 23.60 11.3
383 20.20 396.90 24.56 12.3
384 285.83 30.63 8.80 NaN
385 396.90 30.81 7.20 NaN
386 396.90 28.28 10.50 NaN
387 396.90 31.99 7.40 NaN
388 372.92 30.62 10.20 NaN
389 20.20 396.90 20.85 11.5
390 20.20 394.43 17.11 15.1
391 20.20 378.38 18.76 23.2
392 396.90 25.68 9.70 NaN
393 20.20 396.90 15.17 13.8
394 396.90 16.35 12.70 NaN
395 20.20 391.98 17.12 13.1
396 20.20 396.90 19.37 12.5
397 20.20 393.10 19.92 8.5
398 396.90 30.59 5.00 NaN
399 20.20 338.16 29.97 6.3
400 396.90 26.77 5.60 NaN
401 396.90 20.32 7.20 NaN
402 20.20 376.11 20.31 12.1
403 396.90 19.77 8.30 NaN
404 329.46 27.38 8.50 NaN
405 384.97 NaN 5.00 NaN
406 370.22 23.34 11.90 NaN
407 332.09 12.13 27.90 NaN
408 20.20 314.64 26.40 17.2
409 179.36 19.78 27.50 NaN
410 2.60 10.11 15.00 NaN
411 35.05 21.22 17.20 NaN
412 28.79 34.37 17.90 NaN
413 210.97 20.08 16.30 NaN
414 88.27 36.98 7.00 NaN
415 27.25 29.05 7.20 NaN
416 21.57 25.79 7.50 NaN
417 127.36 NaN 10.40 NaN
418 16.45 20.62 8.80 NaN
419 48.45 22.74 8.40 NaN
420 318.75 15.02 16.70 NaN
421 20.20 319.98 15.70 14.2
422 291.55 14.10 20.80 NaN
423 20.20 2.52 23.29 13.4
424 20.20 3.65 17.16 11.7
425 7.68 24.39 8.30 NaN
426 24.65 15.69 10.20 NaN
427 18.82 14.52 10.90 NaN
428 20.20 96.73 21.52 11.0
429 20.20 60.72 24.08 9.5
430 20.20 83.45 17.64 14.5
431 81.33 19.69 14.10 NaN
432 20.20 97.95 12.03 16.1
433 20.20 100.19 16.22 14.3
434 100.63 15.17 11.70 NaN
435 109.85 23.27 13.40 NaN
436 27.49 18.05 9.60 NaN
437 9.32 NaN 8.70 NaN
438 68.95 34.02 8.40 NaN
439 20.20 396.90 22.88 12.8
440 391.45 22.11 10.50 NaN
441 20.20 385.96 19.52 17.1
442 20.20 395.69 16.59 18.4
443 20.20 386.73 18.85 15.4
444 240.52 23.79 10.80 NaN
445 43.06 23.98 11.80 NaN
446 20.20 318.01 17.79 14.9
447 20.20 388.52 16.44 12.6
448 20.20 396.90 18.13 14.1
449 20.20 304.21 19.31 13.0
450 20.20 0.32 17.44 13.4
451 20.20 355.29 17.73 15.2
452 20.20 385.09 17.27 16.1
453 20.20 375.87 16.74 17.8
454 20.20 6.68 18.71 14.9
455 20.20 50.92 18.13 14.1
456 20.20 10.48 19.01 12.7
457 20.20 3.50 16.94 13.5
458 20.20 272.21 16.23 14.9
459 20.20 396.90 14.70 20.0
460 20.20 255.23 16.42 16.4
461 20.20 391.43 14.65 17.7
462 20.20 396.90 13.99 19.5
463 20.20 393.82 10.29 20.2
464 20.20 396.90 13.22 21.4
465 20.20 334.40 14.13 19.9
466 20.20 22.01 17.15 19.0
467 20.20 331.29 21.32 19.1
468 368.74 18.13 19.10 NaN
469 396.90 14.76 20.10 NaN
470 20.20 396.90 16.29 19.9
471 20.20 395.33 12.87 19.6
472 20.20 393.37 14.36 23.2
473 20.20 374.68 11.66 29.8
474 20.20 352.58 18.14 13.8
475 20.20 302.76 24.10 13.3
476 20.20 396.21 18.68 16.7
477 349.48 24.91 12.00 NaN
478 379.70 NaN 14.60 NaN
479 383.32 13.11 21.40 NaN
480 20.20 396.90 10.74 23.0
481 20.20 393.07 7.74 23.7
482 20.20 395.28 7.01 25.0
483 20.20 392.92 10.42 21.8
484 20.20 370.73 13.34 20.6
485 20.20 388.62 10.58 21.2
486 20.20 392.68 14.98 19.1
487 20.20 388.22 11.45 20.6
488 20.10 395.09 18.06 15.2
489 20.10 344.05 23.97 7.0
490 20.10 318.43 29.68 8.1
491 20.10 390.11 18.07 13.6
492 20.10 396.90 13.35 20.1
493 19.20 396.90 12.01 21.8
494 19.20 396.90 13.59 24.5
495 19.20 393.29 17.60 23.1
496 19.20 396.90 21.14 19.7
497 19.20 396.90 14.10 18.3
498 19.20 396.90 12.92 21.2
499 19.20 395.77 15.10 17.5
500 19.20 396.90 14.33 16.8
501 21.00 391.99 9.67 22.4
502 21.00 396.90 9.08 20.6
503 21.00 396.90 5.64 23.9
504 21.00 393.45 6.48 22.0
505 21.00 396.90 7.88 11.9
Q2. There might be some missing values in each column. For each feature, obtain the number and proportion of missing data in a table.¶
# Count of missing (NaN) values per column.
df.isnull().sum()
CRIM 0 ZN 0 INDUS 0 CHAS 26 NOX 0 RM 0 AGE 0 DIS 27 RAD 0 TAX 0 PTRATIO 0 B 20 LSTAT 0 MEDV 54 dtype: int64
# Proportion of missing values per column, expressed as a percentage.
(df.isnull().sum() / len(df)) * 100
CRIM 0.000000 ZN 0.000000 INDUS 0.000000 CHAS 5.138340 NOX 0.000000 RM 0.000000 AGE 0.000000 DIS 5.335968 RAD 0.000000 TAX 0.000000 PTRATIO 0.000000 B 3.952569 LSTAT 0.000000 MEDV 10.671937 dtype: float64
Q3. Graph the number of unique values for each feature and explain them.¶
As seen above, the fewer unique values a feature has, the more likely it is to be a categorical or discrete feature.
# Plot a 5-bin histogram for every column of the DataFrame on one large figure.
figure = plt.figure(figsize=(15, 20))
shared_axes = figure.gca()
df.hist(bins=5, ax=shared_axes, color="skyblue", ec="skyblue")
array([[<Axes: title={'center': 'CRIM'}>, <Axes: title={'center': 'ZN'}>,
<Axes: title={'center': 'INDUS'}>,
<Axes: title={'center': 'CHAS'}>],
[<Axes: title={'center': 'NOX'}>, <Axes: title={'center': 'RM'}>,
<Axes: title={'center': 'AGE'}>, <Axes: title={'center': 'DIS'}>],
[<Axes: title={'center': 'RAD'}>, <Axes: title={'center': 'TAX'}>,
<Axes: title={'center': 'PTRATIO'}>,
<Axes: title={'center': 'B'}>],
[<Axes: title={'center': 'LSTAT'}>,
<Axes: title={'center': 'MEDV'}>, <Axes: >, <Axes: >]],
dtype=object)
# Sequential matplotlib colormaps, used later for the per-feature hexbin panels.
cmaps = ['Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn']
# Solid colours shared across the plots below; `i // 2` gives one colour per
# two consecutive plots.
colors = ["red","green","blue","orange","pink","purple","brown"]
# Running plot counter, incremented once per histogram cell below.
i = 0
df['CRIM'].hist(color = colors[i//2])
plt.title("CRIM")
Text(0.5, 1.0, 'CRIM')
# Histogram of ZN with the next colour in the shared rotating palette.
i += 1
df['ZN'].hist(color = colors[i//2])
plt.title("ZN")
Text(0.5, 1.0, 'ZN')
# Histogram of INDUS with the next colour in the shared rotating palette.
i+=1
df['INDUS'].hist(color = colors[i//2])
plt.title("INDUS")
Text(0.5, 1.0, 'INDUS')
# Histogram of NOX (CHAS is skipped — it is a binary flag, not continuous).
i+=1
df['NOX'].hist(color = colors[i//2])
plt.title("NOX")
Text(0.5, 1.0, 'NOX')
# Histogram of RM with the next colour in the shared rotating palette.
i+=1
df['RM'].hist(color = colors[i//2])
plt.title("RM")
Text(0.5, 1.0, 'RM')
# Histogram of AGE with the next colour in the shared rotating palette.
i+=1
df['AGE'].hist(color = colors[i//2])
plt.title("AGE")
Text(0.5, 1.0, 'AGE')
# Histogram of DIS with the next colour in the shared rotating palette.
i+=1
df['DIS'].hist(color = colors[i//2])
plt.title("DIS")
Text(0.5, 1.0, 'DIS')
# Histogram of RAD with the next colour in the shared rotating palette.
i+=1
df['RAD'].hist(color = colors[i//2])
plt.title("RAD")
Text(0.5, 1.0, 'RAD')
# Histogram of TAX with the next colour in the shared rotating palette.
i+=1
df['TAX'].hist(color = colors[i//2])
plt.title("TAX")
Text(0.5, 1.0, 'TAX')
# Histogram of PTRATIO with the next colour in the shared rotating palette.
i+=1
df['PTRATIO'].hist(color = colors[i//2])
plt.title("PTRATIO")
Text(0.5, 1.0, 'PTRATIO')
# Histogram of B with the next colour in the shared rotating palette.
i+=1
df['B'].hist(color = colors[i//2])
plt.title("B")
Text(0.5, 1.0, 'B')
# Histogram of LSTAT with the next colour in the shared rotating palette.
i+=1
df['LSTAT'].hist(color = colors[i//2])
plt.title("LSTAT")
Text(0.5, 1.0, 'LSTAT')
# Histogram of the target MEDV with the next colour in the rotating palette.
i+=1
df['MEDV'].hist(color = colors[i//2])
plt.title("MEDV")
Text(0.5, 1.0, 'MEDV')
# Number of unique values per feature, as a two-column frame
# (feature name, count) for plotting below.
nu = df.nunique().reset_index()
nu
| index | 0 | |
|---|---|---|
| 0 | CRIM | 452 |
| 1 | ZN | 27 |
| 2 | INDUS | 77 |
| 3 | CHAS | 2 |
| 4 | NOX | 132 |
| 5 | RM | 437 |
| 6 | AGE | 399 |
| 7 | DIS | 343 |
| 8 | RAD | 10 |
| 9 | TAX | 67 |
| 10 | PTRATIO | 85 |
| 11 | B | 357 |
| 12 | LSTAT | 445 |
| 13 | MEDV | 210 |
# Bar chart of the number of unique values per feature.
plt.figure(figsize=(10,5))
plt.title("number of unique vals for every f ")
nu.columns = ['feature','num of unique vals']
ax = sns.barplot(x='feature', y='num of unique vals', data=nu)
# Rotate the tick labels AFTER drawing: seaborn replaces the x tick labels
# when it plots, so a rotation applied beforehand is discarded.
plt.xticks(rotation = 80)
Q4. Draw a diagram of the dependence of the features on each other.Explain about this diagram and tell which features are more dependent on the target column?¶
In machine learning, understanding the dependence of features on each other is crucial for building accurate models. Let’s delve into this topic¶
Feature Interaction:
When features interact with each other in a prediction model, the prediction cannot be expressed as the simple sum of individual feature effects. This is because the effect of one feature depends on the value of another feature.¶
For example, consider a model predicting house prices using two features: house size (big or small) and location (good or bad). If there’s no interaction, the prediction would be a sum of the individual feature effects.¶
However, when interaction exists, the prediction becomes more complex. Let’s break it down:¶
Without Interaction:
Suppose we have the following predictions:¶
Good location, big house: 400,000¶
Good location, small house: 200,000¶
Bad location, big house: 250,000¶
Bad location, small house: 150,000¶
Decomposing the prediction:¶
Constant term: 150,000¶
Size effect (big vs. small): +100,000 (always)¶
Location effect (good vs. bad): +50,000 (always)¶
No interaction effect because the prediction is a sum of individual feature effects.¶
With Interaction: Now let’s consider a different scenario:
Good location, big house: 400,000¶
Good location, small house: 200,000¶
Bad location, big house: 250,000¶
Bad location, small house: 150,000¶
Decomposing the prediction:¶
Constant term: 150,000¶
Size effect (big vs. small): +100,000 (always)¶
Location effect (good vs. bad): +50,000 (always)¶
Interaction effect (big house in good location): +100,000¶
Here, the interaction depends on both size and location. When a house is big and in a good location, the prediction increases by an additional $100,000.¶
Measuring Interaction Strength:
One way to estimate interaction strength is using the H-statistic, introduced by Friedman and Popescu (2008). It measures how much prediction variation depends on feature interactions.¶
Visualizing interaction effects can be done using techniques like Partial Dependence Plots (PDP), which show how features vary with predictions123.¶
As seen above, RM and, after that, ZN and B are more dependent on the target column. RM has approximately a positive linear relationship with MEDV, and LSTAT has a negative linear relationship with MEDV.
# Annotated heatmap of the pairwise feature correlations (2-decimal labels).
plt.figure(figsize=(12, 6))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix,
            cmap='BrBG',
            fmt='.2f',
            linewidths=2,
            annot=True)
<Axes: >
Q5. Scatter and hexbin plots are usually used to check feature correlations. Use these charts to check dependencies with the target column. Briefly explain the use and meaning of each.¶
Let’s explore scatter plots and hexbin plots, both of which are valuable tools for visualizing data and understanding feature dependencies:
Scatter Plots: A scatter plot is a common type of 2-dimensional plot used to display the distribution of data points. In a scatter plot, each data point is represented as a single dot on the graph.
Use: Scatter plots are useful for showing the relationship between two continuous variables (usually X and Y).
They help identify patterns, trends, and potential outliers.
Scatter plots are commonly used in exploratory data analysis (EDA) and regression analysis.
Meaning: The position of each dot represents the values of the two variables (X and Y) for a specific data point.
The scatter plot allows us to visually assess whether there is a linear or non-linear relationship between the variables.
The closer the dots are to forming a straight line, the stronger the correlation between the variables.
Hexbin Plots: A hexbin plot is an alternative to scatter plots, especially when dealing with large datasets. Unlike traditional scatter plots, where each data point is a single dot, hexbin plots aggregate data points into hexagonal bins.
Use: Hexbin plots are useful for showing the relationship between two continuous variables when there are many data points and individual dots would overlap (overplotting).
They summarize point density, making patterns visible that a crowded scatter plot would hide.
Hexbin plots are commonly used in exploratory data analysis (EDA) of large datasets.
Meaning: The hexagons represent regions of the graphing space.
The color of each hexagon is determined by aggregating the data points within that hexagon (usually by taking the mean of a third variable, such as the target column).
The color gradient indicates the density of data points within each hexagonal area.
Hexbin plots are especially powerful when visualizing geospatial data, as they resemble maps and can reveal spatial patterns.
In summary, scatter plots are great for exploring relationships between two continuous variables, while hexbin plots are excellent for handling large datasets and spatially dependent data. Both provide valuable insights into feature dependencies!
# Scatter plot of each feature against the target MEDV.
# The colour index advances one step per plot, so `idx // 2` assigns one
# colour from the shared palette to every two consecutive plots — exactly
# the behaviour of the original counter-based version.
feature_cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
for idx, feature in enumerate(feature_cols):
    df.plot.scatter(x='MEDV', y=feature, color=colors[idx // 2])
<Axes: xlabel='MEDV', ylabel='LSTAT'>
# One hexbin panel per numeric column (the target MEDV included) against
# MEDV; each panel gets its own sequential colormap from `cmaps`, in order.
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
fig, axes = plt.subplots(nrows=len(numerical_cols), ncols=1,
                         figsize=(6, 6 * len(numerical_cols)))
for panel, column, colormap in zip(axes, numerical_cols, cmaps):
    panel.hexbin(df[column], df["MEDV"], gridsize=30, cmap=colormap)
    panel.set_title(f"Hexbin of {column} vs MEDV")
    panel.set_xlabel(column)
    panel.set_ylabel("MEDV")
plt.tight_layout()
plt.show()
# For every column: features with few distinct values (< 50) get a bar
# chart of their value counts (categorical-looking), while numeric
# features get a 30-bin histogram. The colour index advances one step per
# column, giving two columns per palette colour, as before.
for color_step, column in enumerate(df.columns):
    plot_color = colors[color_step // 2]
    counts = df[column].value_counts()
    if len(counts) < 50:
        plt.figure(figsize=(10, 5))
        counts.plot(kind='bar', color=plot_color)
        plt.title(f'Number of observations for each unique value of {column}')
        plt.xlabel('Unique Values')
        plt.ylabel('Number of Observations')
        plt.show()
    elif df[column].dtype == 'int64' or df[column].dtype == 'float64':
        plt.figure(figsize=(10, 5))
        df[column].plot(kind='hist', bins=30, color=plot_color)
        plt.title(f'Distribution of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.show()
Q6. Research other checking methods you can use for the dataset and implement one.¶
Let’s explore some other data validation and checking methods that you can use for your dataset. Ensuring data quality and accuracy is crucial for reliable analysis. Here are a few techniques and tools you can consider:
Manual Data Validation: Manually inspect and validate the data by comparing it against predefined rules or criteria.
Advantages: Simple and straightforward.
Useful for small datasets.
Disadvantages: Time-consuming for large datasets.
Prone to human error.
Automated Data Validation Tools: Use specialized tools to automatically validate data based on predefined rules.Examples:
Google Data Validation Tool: An open-source tool that checks data quality and consistency.
DataTest: A Python package for data validation.
Colander: Another Python package for data validation.
Voluptuous: A Python library for defining and validating data schemas.
Arcion: A cloud service for data validation.
TravisCI: Continuous integration and deployment tools that offer automated data validation.
Cross-Validation: Split your dataset into training and validation sets.
Use different subsets of the data for training and validation to assess model performance.
Techniques include k-fold cross-validation, leave-one-out cross-validation, and stratified cross-validation.
Statistical Checks: Perform statistical tests to identify outliers, missing values, and inconsistencies.Examples:
Range Check: Ensure data falls within expected ranges.
Format Check: Validate data format (e.g., date formats, phone numbers).
Consistency Check: Verify relationships between related data fields.
Uniqueness Check: Ensure identifiers that must be unique (e.g., primary keys) contain no duplicates.
Machine Learning-Based Checks: Train a machine learning model to predict missing or incorrect values.
Use imputation techniques or anomaly detection algorithms. Validation ensures the reliability of your analyses!
# One scatter subplot per column (MEDV vs itself included) against MEDV,
# drawn with a club-suit marker, as a quick visual check of each
# feature/target relationship.
column_count = len(df.columns)
fig, axes = plt.subplots(nrows=column_count, ncols=1,
                         figsize=(7, 4 * column_count))
for row, column in enumerate(df.columns):
    subplot = axes[row]
    subplot.scatter(df[column], df["MEDV"], c=colors[row // 2], alpha=0.5,
                    marker=r'$\clubsuit$', label="Luck")
    subplot.set_title(column + ' to MEDV' + " relation")
    subplot.set_xlabel(column)
    subplot.set_ylabel("MEDV")
plt.tight_layout()
plt.show()
2) Preprocessing Data¶
Q7. Explain the methods of filling Missing Values and implement at least three methods. Briefly mention the reason for using each method¶
Let’s explore some methods for handling missing values in a dataset and implement three of them using Python. Missing data can occur due to various reasons, such as data collection errors, sensor failures, or user omissions. It’s essential to address missing values to ensure accurate and reliable analyses.
Here are three common methods for filling missing values:
Mean/Median/Mode Imputation:
Method: Replace missing values with the mean (for numerical features), median (for skewed distributions), or mode (for categorical features) of the respective column.
Reason: This method is straightforward and works well when the missing values are missing at random (MAR). It helps maintain the overall distribution of the feature.
Forward Fill (ffill) and Backward Fill (bfill):
Method: Forward Fill (ffill): Propagate the last observed non-null value forward to fill missing values. Backward Fill (bfill): Propagate the next non-null value backward to fill missing values.
Reason: These methods are useful for time-series data, where missing values can be interpolated based on the previous or subsequent values.
Interpolation:
Method: Use interpolation techniques (such as linear, polynomial, or spline interpolation) to estimate missing values based on neighboring data points.
Reason: Interpolation provides a more sophisticated way to estimate missing values, especially when the data follows a trend or pattern.
df
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | NaN | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273.0 | 21.0 | 391.99 | 9.67 | 22.4 |
| 502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273.0 | 21.0 | 396.90 | 9.08 | 20.6 |
| 503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273.0 | 21.0 | 396.90 | 5.64 | 23.9 |
| 504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273.0 | 21.0 | 393.45 | 6.48 | 22.0 |
| 505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273.0 | 21.0 | 396.90 | 7.88 | 11.9 |
506 rows × 14 columns
# df.ffill(axis = 0)
# df.bfill(axis = 0)
# df["B"].fillna( method ='ffill',limit = 1) #, inplace = True
# df
# Summary statistics for column B, used to choose the fill strategy below.
median = df['B'].median()
mean = df['B'].mean()
std = df['B'].std()
min_ = df['B'].min()
max_ = df['B'].max()
b_stats = [("Median", median), ("Mean", mean),
           ("Standard Deviation", std), ("min", min_), ("max", max_)]
for stat_label, stat_value in b_stats:
    print(f"B {stat_label}: {stat_value:.2f}")
B Median: 390.88 B Mean: 336.82 B Standard Deviation: 121.17 B min: 0.32 B max: 396.90
# Fill missing B with the median: B is heavily skewed (mean 336.82 vs
# median 390.88 above), so the median is the more robust central value.
# Assign the result back instead of using inplace=True on a column
# selection — that chained-assignment pattern is deprecated in pandas 2.x
# and does not modify the frame under copy-on-write.
df['B'] = df['B'].fillna(df['B'].median())
# Summary statistics for column CHAS, used to choose the fill strategy below.
median = df['CHAS'].median()
mean = df['CHAS'].mean()
std = df['CHAS'].std()
min_ = df['CHAS'].min()
max_ = df['CHAS'].max()
chas_stats = [("Median", median), ("Mean", mean),
              ("Standard Deviation", std), ("min", min_), ("max", max_)]
for stat_label, stat_value in chas_stats:
    print(f"CHAS {stat_label}: {stat_value:.2f}")
CHAS Median: 0.00 CHAS Mean: 0.17 CHAS Standard Deviation: 0.38 CHAS min: 0.00 CHAS max: 1.00
# CHAS is a binary 0/1 flag; its median (0.0, also the dominant value per
# the stats above) keeps the majority category. Assign back rather than
# using inplace=True on a column selection (deprecated chained assignment
# in pandas 2.x).
df['CHAS'] = df['CHAS'].fillna(df['CHAS'].median())
# Summary statistics for column DIS, used to choose the fill strategy below.
median = df['DIS'].median()
mean = df['DIS'].mean()
std = df['DIS'].std()
min_ = df['DIS'].min()
max_ = df['DIS'].max()
dis_stats = [("Median", median), ("Mean", mean),
             ("Standard Deviation", std), ("min", min_), ("max", max_)]
for stat_label, stat_value in dis_stats:
    print(f"DIS {stat_label}: {stat_value:.2f}")
DIS Median: 3.92 DIS Mean: 6.21 DIS Standard Deviation: 6.53 DIS min: 1.13 DIS max: 24.00
# Fill missing DIS with the mean, preserving the column's overall average.
# NOTE(review): DIS looks right-skewed (mean 6.21 vs median 3.92 above),
# so the median may be the more robust choice — kept as the mean to match
# the original analysis. Assign back rather than using inplace=True on a
# column selection (deprecated chained assignment in pandas 2.x).
df['DIS'] = df['DIS'].fillna(df['DIS'].mean())
# Summary statistics for the target MEDV, used to choose the fill strategy below.
median = df['MEDV'].median()
mean = df['MEDV'].mean()
std = df['MEDV'].std()
min_ = df['MEDV'].min()
max_ = df['MEDV'].max()
medv_stats = [("Median", median), ("Mean", mean),
              ("Standard Deviation", std), ("min", min_), ("max", max_)]
for stat_label, stat_value in medv_stats:
    print(f"MEDV {stat_label}: {stat_value:.2f}")
MEDV Median: 21.95 MEDV Mean: 23.75 MEDV Standard Deviation: 8.81 MEDV min: 6.30 MEDV max: 50.00
df['MEDV'].fillna((df['MEDV'].mean()),inplace =True)
df.isnull().sum()
CRIM 0 ZN 0 INDUS 0 CHAS 0 NOX 0 RM 0 AGE 0 DIS 0 RAD 0 TAX 0 PTRATIO 0 B 0 LSTAT 0 MEDV 0 dtype: int64
Q8. Is it possible to delete some columns? Why? If it is possible, delete the necessary columns by mentioning the reason.¶
Yes, it is possible to delete or drop columns from a DataFrame, and there are a variety of reasons why you might want to do this:
Irrelevant Data: The column may not contain information relevant to the problem you are trying to solve or the analysis you are conducting.
Data Leakage: The column could contain data that would not be available at the time of making predictions (future data), and using this data during training could lead to overfitting.
Redundant Information: Sometimes different columns can contain overlapping information. To reduce dimensionality and multicollinearity (which can be problematic for certain types of models), you might drop one of the redundant columns.
Noisy Data: If a column contains too much noise, it might actually decrease the model's performance.
Too Many Missing Values: If a column contains too many missing values, it might not be practical to impute or fill those missing values, and dropping the column could be a better solution.
Computational Efficiency: Reducing the number of features can lead to faster computation, which can be important when working with large datasets or complex models.
Improve Model Performance: Sometimes models perform better with a smaller set of features. Dropping irrelevant or less important features can potentially improve model performance.
Yes — as the correlation chart above shows, there is a high correlation between RM, NOX, RAD and DIS, so we delete two of them (DIS and NOX).
df
| CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.900 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.900 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.830 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 390.885 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.900 | 5.33 | 36.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273.0 | 21.0 | 391.990 | 9.67 | 22.4 |
| 502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273.0 | 21.0 | 396.900 | 9.08 | 20.6 |
| 503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273.0 | 21.0 | 396.900 | 5.64 | 23.9 |
| 504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273.0 | 21.0 | 393.450 | 6.48 | 22.0 |
| 505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273.0 | 21.0 | 396.900 | 7.88 | 11.9 |
506 rows × 14 columns
# Drop NOX and DIS: per the correlation analysis above they are strongly
# correlated with other retained features, so removing them reduces
# multicollinearity without losing much information.
df = df.drop(["NOX", "DIS"], axis=1)
df
| CRIM | ZN | INDUS | CHAS | RM | AGE | RAD | TAX | PTRATIO | B | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 6.575 | 65.2 | 1 | 296.0 | 15.3 | 396.900 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 6.421 | 78.9 | 2 | 242.0 | 17.8 | 396.900 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 7.185 | 61.1 | 2 | 242.0 | 17.8 | 392.830 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 6.998 | 45.8 | 3 | 222.0 | 18.7 | 390.885 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 7.147 | 54.2 | 3 | 222.0 | 18.7 | 396.900 | 5.33 | 36.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 0.0 | 11.93 | 0.0 | 6.593 | 69.1 | 1 | 273.0 | 21.0 | 391.990 | 9.67 | 22.4 |
| 502 | 0.04527 | 0.0 | 11.93 | 0.0 | 6.120 | 76.7 | 1 | 273.0 | 21.0 | 396.900 | 9.08 | 20.6 |
| 503 | 0.06076 | 0.0 | 11.93 | 0.0 | 6.976 | 91.0 | 1 | 273.0 | 21.0 | 396.900 | 5.64 | 23.9 |
| 504 | 0.10959 | 0.0 | 11.93 | 0.0 | 6.794 | 89.3 | 1 | 273.0 | 21.0 | 393.450 | 6.48 | 22.0 |
| 505 | 0.04741 | 0.0 | 11.93 | 0.0 | 6.030 | 80.8 | 1 | 273.0 | 21.0 | 396.900 | 7.88 | 11.9 |
506 rows × 12 columns
Q9. Which features are called numerical and which are called categorical? What is the difference between these two types of features? Identify numerical and categorical characteristics in this data set.¶
Let’s discuss the difference between numerical and categorical features and identify them in the given dataset.
Numerical Features (Quantitative Data): Numerical features are expressed as numbers and can be measured or counted.
They represent quantities or amounts.
Examples of numerical features include: Age ,Height,Income,Temperature,Number of children .
Categorical Features (Qualitative Data): Categorical features represent categories or labels.
They do not have inherent numerical meaning.
Examples of categorical features include: Gender (male/female) ,Color (red, green, blue) ,Marital status (single, married, divorced)
# Feature typing for this dataset. NOX and DIS were dropped from df earlier,
# so they must not appear here: keeping them would raise a KeyError in any
# later df[numerical_features] selection (e.g. the MinMaxScaler cell below).
numerical_features = [
    "CRIM", "ZN", "INDUS", "RM", "AGE", "RAD",
    "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]
# CHAS only takes the values 0/1 (see its min/max above), i.e. categorical.
categorical_features = ["CHAS"]
Q10. What is the purpose of normalizing or standardizing in numerical features? What is the difference between these two methods? Is there a need to do this in this project?¶
Let’s dive into the purpose of normalizing and standardizing numerical features and explore the differences between these two methods:
Purpose of Normalization and Standardization: Both normalization and standardization are techniques used to preprocess numerical features before feeding them into machine learning models.
The primary goals are:
Scaling: Bring all features to a similar scale to prevent certain features from dominating others during model training.
Stabilizing Algorithms: Some algorithms (like gradient descent) perform better when features are scaled.
Normalization (Min-Max Scaling):
Method: Normalize each feature to a specific range (usually [0, 1]).
Purpose: Useful when features have different ranges and you want them all within a consistent scale.
Preserves the original distribution of the data.
Example: Age (range: 0-100) and income (range: $20,000-$200,000) can be normalized to [0, 1].
Standardization (Z-Score Scaling)
Method: Standardize each feature to have a mean of 0 and a standard deviation of 1.
Purpose: Useful when features have different units or distributions.
Centers the data around zero.
Does not preserve the original distribution.
Example: Standardizing height (in cm) and weight (in kg) to have zero mean and unit variance.
When to Use Each Method:
Normalization: When the feature distribution is not necessarily Gaussian.
When you want to preserve the original data range.
Standardization: When the feature distribution is approximately Gaussian.
When you want to center the data around zero and have unit variance.
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# # df["MEDV"]
# temp = pd.DataFrame(scaler.fit_transform(df[numerical_features]), columns=numerical_features, index=df.index)
# df = pd.concat([temp , df[categorical_features] , df["MEDV"]] , axis = 1 )
Q11. What kind of preprocess is usefull for categorical features which are usually saved as strings or objects ?¶
Categorical features, which are typically represented as strings or objects, require preprocessing before they can be effectively used in machine learning models. Let’s explore some common techniques for handling categorical data:
Label Encoding: In label encoding, each unique category or label is assigned a unique numerical value. For example, if you have a “color” feature with labels “red,” “green,” and “blue,” you could encode them as 0, 1, and 2, respectively..
This method is suitable for ordinal data (where there is an inherent order among categories), but it may not be ideal for nominal data (where categories lack any intrinsic hierarchy).
Sklearn provides the LabelEncoder class for this purpose
One-Hot Encoding: One-hot encoding converts categorical data into binary vectors. Each category becomes a separate binary feature, and only one of these features is active (1) for each observation.
For example, if you have a “fruit” feature with labels “apple,” “banana,” and “orange,” one-hot encoding would create three binary features: “is_apple,” “is_banana,” and “is_orange.”
This method is commonly used for nominal data.
Sklearn provides the OneHotEncoder class for one-hot encoding
Ordinal Encoding Ordinal encoding is suitable for ordinal data. It assigns numeric values based on the order or ranking of categories.
For instance, if you have an “education level” feature with labels “high school,” “bachelor’s,” and “master’s,” you could encode them as 0, 1, and 2, respectively.
Frequency Encoding: In frequency encoding, each category is replaced with its frequency (count) in the dataset. This can be useful when certain categories are more common than others.
For example, if you have a “country” feature, you could encode each country with the number of occurrences in the dataset.
Target Encoding: Target encoding uses the target variable (the variable you want to predict) to encode categorical features.
For each category, the mean (or other aggregation) of the target variable is calculated. This aggregated value replaces the original category label.
Target encoding can be helpful when there is a relationship between the categorical feature and the target variable.
Q12. Research validation, train, test data and explain common segmentation methods.Then divide your data into these categories.¶
Let’s dive into the concepts of data segmentation and the common methods for dividing data into different subsets.
Data segmentation involves splitting a dataset into multiple subsets for various purposes, such as model training, validation, and testing. Each subset serves a specific role in the machine learning pipeline:
Training Data: The training dataset is used to train the machine learning model. It contains labeled examples (input features and corresponding target labels).
The model learns from this data during training to make accurate predictions.
Typically, the training data constitutes the largest portion of the dataset.
Validation Data: The validation dataset is used to tune hyperparameters and assess the model’s performance during training.
It helps prevent overfitting by providing an independent set of examples not used for training.
Commonly, a portion of the training data is set aside for validation.
Test Data: The test dataset evaluates the model’s performance after training and hyperparameter tuning.
It provides an unbiased estimate of how well the model generalizes to unseen data.
The test data should not be used during model development or hyperparameter tuning.
Common Segmentation Methods: Here are some common ways to divide data into these categories:
Hold-Out Method: Split the dataset into three parts: training, validation, and test sets.For example: 70% for training 15% for validation 15% for testing
K-Fold Cross-Validation: Divide the dataset into K equally sized folds (subsets).
Train the model K times, each time using K-1 folds for training and the remaining fold for validation.
Calculate the average performance across all K iterations.
Common choices for K: 5 or 10.
Stratified Sampling: Ensure that each subset (training, validation, test) maintains the same class distribution as the original dataset.
Useful when dealing with imbalanced classes.
4.Time-Based Splitting: For time-series data, split based on chronological order.
Use earlier data for training, intermediate data for validation, and the latest data for testing.
# Hold-out split: 80% train / 20% test. The fixed random_state pins the
# shuffle so the split (and every downstream result) is reproducible.
features = df.drop(columns=["MEDV"])
target = df["MEDV"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=885
)
Q13. Research other preprocessing methods.Mention some and use them if needed.¶
Preprocessing is a crucial step in preparing data for machine learning models. Let’s explore some common preprocessing techniques:
Standardization (Mean Removal and Variance Scaling):
Standardization ensures that features have zero mean and unit variance. It’s essential for many machine learning algorithms, especially those that assume normally distributed data.
The StandardScaler from scikit-learn is a convenient way to perform this operation. It computes the mean and standard deviation on a training set and applies the same transformation to the testing set
Data Cleaning:: Detecting and fixing bad or inaccurate observations in your dataset improves data quality. Removing duplicates, handling missing values, and correcting inconsistencies fall under data cleaning.
Dimensionality Reduction: Techniques like Principal Component Analysis (PCA) reduce the number of features while preserving most of the information. This helps prevent overfitting and speeds up training.
Feature Engineering: Create new features based on domain knowledge. For example, combining existing features, creating interaction terms, or extracting relevant information.
Handling Large Amounts of Data (Sampling): When dealing with large datasets, consider sampling techniques (e.g., random sampling) to work with a manageable subset.
Data Transformation: Convert data to a consistent structure. This includes encoding categorical variables, scaling numerical features, and handling skewed distributions.
3) Traning , Evaluation and Adjusment¶
Phase one : Linear Regression¶
Main form of simple linear regression function: $$f(x) = \alpha x + \beta$$
here we want to find the slope ($\alpha$) and the intercept ($\beta$) by minimizing the Residual Sum of Squares (RSS) function, i.e. by setting its derivatives to zero:
- step 1: Compute RSS of the training data
$$ RSS = \Sigma (y_i - (\hat{\beta} + \hat{\alpha} * x_i) )^2 $$
- step 2: Compute the derivatives of the RSS function in terms of $\alpha$ and $\beta$, and set them equal to 0 to find the desired parameters
$$ \frac{\partial RSS}{\partial \beta} = \Sigma (-f(x_i) + \hat{\beta} + \hat{\alpha} * x_i) = 0$$ $$ \to \beta = \hat{y} - \hat{\alpha} \hat{x} \to (1)$$
$$ \frac{\partial RSS}{\partial \alpha} = \Sigma (-2 x_i y_i + 2 \hat{\beta} x_i + 2\hat{\alpha} x_i ^ 2) = 0 \to (2)$$
$$ (1) , (2) \to \hat{\alpha} = \frac{\Sigma{(x_i - \hat{x})(y_i - \hat{y})}}{\Sigma{(x_i - \hat{x})^2}} $$ $$ \hat{\beta} = \hat{y} - \hat{\alpha} \hat{x}$$
Based on the above formula, implement the function below to compute the parameters of a simple linear regression
def linear_regression(input, output):
    """Fit a simple least-squares line y = slope * x + intercept.

    Parameters
    ----------
    input : array-like
        Predictor values x_i (must support elementwise ``*`` and ``**``,
        e.g. a numpy array or pandas Series).
    output : array-like
        Target values y_i, same length as ``input``.

    Returns
    -------
    tuple
        ``(slope, intercept)``. NOTE: the previous version returned these
        same two numbers but with the names swapped (the variable called
        "intercept" actually held the slope). The return order and values
        are unchanged; only the internal names and print labels are fixed,
        so existing callers keep working.
    """
    n = len(input)
    x_mean = np.sum(input) / n
    y_mean = np.sum(output) / n
    # Closed-form least-squares slope derived in the markdown above:
    # slope = (Σ x_i y_i − n·x̄·ȳ) / (Σ x_i² − n·x̄²)
    slope = (np.sum(input * output) - n * x_mean * y_mean) / (
        np.sum(input ** 2) - n * x_mean ** 2
    )
    # intercept = ȳ − slope·x̄, so the fitted line passes through (x̄, ȳ).
    intercept = y_mean - slope * x_mean
    print("slope: ", slope)
    print("intercept: ", intercept)
    return (slope, intercept)
Now complete this get_regression_predictions(...) function to predict the value of given data based on the calculated intercept and slope
def get_regression_predictions(input, intercept, slope):
    """Return the list of predictions for *input*.

    Per this notebook's parameter convention (taken straight from
    linear_regression's return order), ``intercept`` is the multiplicative
    coefficient and ``slope`` the additive offset, so each prediction is
    ``intercept * x + slope``.
    """
    predictions = []
    for x in input:
        predictions.append(intercept * x + slope)
    return predictions
Now that we have a model and can make predictions, let's evaluate our model using Root Mean Square Error (RMSE). RMSE is the square root of the mean of the squared differences between the residuals, and the residuals is just a fancy word for the difference between the predicted output and the true output.
Complete the following function to compute the RMSE of a simple linear regression model given the input_feature, output, intercept and slope:
def get_root_mean_square_error(predicted_values, actual_values):
    """Return the root mean square error between two equal-length sequences.

    Raises ValueError if the sequences differ in length.
    """
    if len(predicted_values) != len(actual_values):
        raise ValueError("The lengths of predicted and actual values must match.")
    diff = np.asarray(predicted_values) - np.asarray(actual_values)
    # RMSE = sqrt(mean(residual^2)); same units as the target variable.
    return np.sqrt(np.mean(diff ** 2))
intercept, slope = linear_regression(X_train['LSTAT'], y_train)
y_pred = get_regression_predictions(X_test['LSTAT'], intercept, slope)
intercept: -0.92191748370245 slope: 34.52255404370124
The RMSE has no bound, thus it becomes challenging to determine whether a particular RMSE value is considered good or bad without any reference point. Instead, we use R2 score. The R2 score is calculated by comparing the sum of the squared differences between the actual and predicted values of the dependent variable to the total sum of squared differences between the actual and mean values of the dependent variable. The R2 score is formulated as below:
$$R^2 = 1 - \frac{SSres}{SStot} = 1 - \frac{\sum_{i=1}^{n} (y_{i,true} - y_{i,pred})^2}{\sum_{i=1}^{n} (y_{i,true} - \bar{y}_{true})^2} $$
Complete the following function to calculate the R2 score of a given input_feature, output, bias, and slope:
def get_r2_score(predicted_values, actual_values):
    """Return the coefficient of determination R^2 = 1 - SS_res / SS_tot.

    R^2 compares the model's residuals against a constant mean-only
    predictor; 1.0 means a perfect fit, 0.0 means no better than the mean.
    Raises ValueError if the sequences differ in length.
    """
    if len(predicted_values) != len(actual_values):
        raise ValueError("The lengths of predicted and actual values must match.")
    y_bar = sum(actual_values) / len(actual_values)
    ss_tot = 0.0
    ss_res = 0.0
    for y_true, y_hat in zip(actual_values, predicted_values):
        ss_tot += (y_true - y_bar) ** 2
        ss_res += (y_true - y_hat) ** 2
    return 1 - (ss_res / ss_tot)
Now calculate the fitness of the model. Remember to provide explanation for the outputs in your code!
# Leftover scaffolding from a random_state / best-feature search experiment:
# max_score = 0
# max_i = 0
# max_type = ""
# for i in range (1500,3000):
# For every remaining feature: fit a one-variable linear model, report its
# test-set R2/RMSE, and plot predicted vs. actual MEDV with the ideal y = x
# reference line.
for feature in X_train.columns:
    print(feature + ":")
    # Closed-form least-squares fit on this single feature.
    intercept, slope = linear_regression(X_train[feature], y_train)
    y_pred = get_regression_predictions(X_test[feature], intercept, slope)
    # print(f"y_pred: {y_pred:.2f}")
    # print("type_pred",type(y_pred))
    # print("type_test",type(y_test.to_list()))
    # print(f"MEDV max: {max_:.2f}")
    # print("y_test_tolist",y_test.to_list())
    # if (get_r2_score(y_pred, y_test) > max_score):
    #     max_score = get_r2_score(y_pred, y_test)
    #     max_i = i
    #     max_type = feature
    print("R2 score: ", get_r2_score(y_pred, y_test))
    print("RMSE score: ", get_root_mean_square_error(y_pred, y_test))
    print("-------------------------------------------------------------")
    # y = x reference line: a point on it is a perfect prediction.
    X_ref = Y_ref = np.linspace(10, 60, 100)
    plt.plot(X_ref, Y_ref, color='red', linewidth=1)
    g = plt.scatter(y_test.to_list(), y_pred, color="green")
    # Fix: the scatter puts the true values on the x axis and the
    # predictions on the y axis, but the labels were swapped.
    g.axes.set_xlabel('True Values ')
    g.axes.set_ylabel('Predictions ')
    plt.tight_layout()
    # 'equal'/'square' keep both axes on the same scale so y = x sits at 45°.
    g.axes.axis('equal')
    g.axes.axis('square')
    plt.show()
# print("max_score,max_i,max_type: ",max_score,max_i,max_type)
CRIM: intercept: -0.8457383032245622 slope: 25.004037946678405 R2 score: 0.1802976349077008 RMSE score: 6.754795496976585 ------------------------------------------------------------- ZN: intercept: 0.11124710845892874 slope: 22.5769882476327 R2 score: 0.2002746298096192 RMSE score: 6.6719771139360695 ------------------------------------------------------------- INDUS: intercept: -0.39334589397502623 slope: 27.504068576492276 R2 score: 0.27840451099637464 RMSE score: 6.337690342745086 ------------------------------------------------------------- CHAS: intercept: 2.0681793801289787 slope: 23.661793686435086 R2 score: -0.027302605489354814 RMSE score: 7.56194137140654 ------------------------------------------------------------- RM: intercept: 0.0020154259694475274 slope: 23.983392432358844 R2 score: -0.03003803409046424 RMSE score: 7.5720023793122975 ------------------------------------------------------------- AGE: intercept: -0.04260497445689591 slope: 26.48976919452121 R2 score: 0.156350519015387 RMSE score: 6.852753860200937 ------------------------------------------------------------- RAD: intercept: -0.0007002388862702945 slope: 24.06930125621563 R2 score: -0.03256784812231128 RMSE score: 7.58129524512511 ------------------------------------------------------------- TAX: intercept: -0.011507082068918966 slope: 27.838751658965375 R2 score: 0.14214290505937544 RMSE score: 6.910215396157035 ------------------------------------------------------------- PTRATIO: intercept: -0.0018079759128433028 slope: 24.09679458737494 R2 score: -0.032827061116955436 RMSE score: 7.582246779172046 ------------------------------------------------------------- B: intercept: 0.010545327736184133 slope: 20.42741231621585 R2 score: -0.014594449585674907 RMSE score: 7.515023659001627 ------------------------------------------------------------- LSTAT: intercept: -0.92191748370245 slope: 34.52255404370124 R2 score: 0.6068839208528527 RMSE score: 4.677830862998919 
-------------------------------------------------------------
Q14. In this project we are implementing supervised machine learning models. What is the difference between these models and unsupervised, semi-supervised and reinforcement learning models? Give an example for each.¶
Let’s explore the differences between supervised, unsupervised, semi-supervised, and reinforcement learning models, along with examples for each:
Supervised Learning:
Definition: Supervised learning builds a model based on labeled data. In this approach, the algorithm learns from input-output pairs (features and corresponding target labels).
Input Data: All data points are labeled (each example has a known target label).
Training Process: External supervision guides the model during training.
Use: Supervised learning is commonly used for tasks like classification (e.g., spam detection, image recognition) and regression (e.g., predicting house prices).
Example Algorithms: Decision trees , Support Vector Machine (SVM) ,Linear regression
Example Use Case: Image recognition: Identifying objects in images based on labeled training data.
Unsupervised Learning:
Definition: Unsupervised learning builds a model based on unlabeled data. The algorithm discovers underlying patterns or structures without explicit target labels.
Input Data: All data points are unlabelled.
Training Process: No external supervision; the algorithm explores data relationships independently.
Use: Unsupervised learning is useful for clustering (grouping similar data points) and dimensionality reduction.
Example Algorithms: K-means clustering, Hierarchical clustering , Principal Component Analysis (PCA)
Example Use Case: Customer segmentation: Grouping customers based on purchasing behavior.
Semi-Supervised Learning:
Definition: Semi-supervised learning sits between supervised and unsupervised learning. It uses a mix of labeled and unlabeled data for model building.
Input Data: Partially labeled (some examples have known labels, while others don’t).
Training Process: Combines external supervision (from labeled data) and self-discovery (from unlabeled data).
Use: Semi-supervised learning is helpful when labeled data is scarce but unlabeled data is abundant.
Example Algorithms: Generative adversarial networks (GANs), Self-trained Naïve Bayes classifier
Example Use Case: Text document classification: Using a small labeled dataset and a larger unlabeled dataset to improve accuracy.
Reinforcement Learning:
Definition: Reinforcement learning is feedback-based. It trains an algorithm using a system of rewards and punishments for correct and incorrect actions.
Input Data: No predefined data; the learning agent interacts with an environment.
Training Process: The goal is to maximize rewards by learning optimal actions.
Use Cases: Playing Games (e.g., Chess): The AI agent learns to make moves that lead to higher rewards. Self-Driving Cars: Navigating traffic based on real-time feedback.
Example Algorithms: Q-learning, Deep Q Network (DQN), Policy optimization
Q15. What is regression and what are the differences with classification methods?¶
Let’s dive into regression and classification and explore their differences:
Regression:
Definition: Regression is a type of supervised machine learning algorithm used to predict a continuous numerical value based on input features.
Use Case: It’s commonly used for tasks like predicting stock prices, estimating house prices, or forecasting sales.
Example: Suppose we have a dataset with information about houses, including square footage and number of bathrooms. We can build a regression model to predict the selling price of a house based on these features. The response variable (selling price) is continuous.\
Evaluation Metric: The most common metric for evaluating regression models is the Root Mean Square Error (RMSE), which measures how far predicted values are from observed values.
Classification:
Definition: Classification is another type of supervised machine learning algorithm that assigns input data to predefined categories or classes
Use Case: It’s used for tasks like spam detection (classifying emails as spam or not), medical diagnosis (categorizing diseases), or sentiment analysis (positive/negative sentiment).
Example: Consider a dataset of college basketball players. We can build a classification model to predict whether a player will be drafted into the NBA based on features like average points per game and division level. The response variable (“drafted” or “not drafted”) is categorical.
Evaluation Metric: The accuracy of a classification model is typically measured by the percentage of correct classifications it makes.
In summary:
Regression predicts continuous values.
Classification assigns data to predefined classes. The evaluation metrics differ between the two types of models
Q16.Briefly explain the relationships presented regarding the linear regression method.¶
In simple linear regression, we have a model of the form:
$[ f(x) = \alpha x + \beta ]$
Here, $(\alpha)$ represents the slope of the line (how much $( y )$ changes for a change in $( x )$), and $(\beta)$ represents the y-intercept (the value of $( y )$ when $( x = 0 )$).
The goal of linear regression is to find the values of $(\alpha)$ and $(\beta)$ that best fit the data. "Best fit" typically means that the sum of the squared differences between the observed values ($( y_i )$) and the values predicted by our model ($( f(x_i) = \hat{\beta} + \hat{\alpha} x_i )$) is minimized. This sum of squared differences is known as the Residual Sum of Squares (RSS):
$[ RSS = \Sigma (y_i - (\hat{\beta} + \hat{\alpha} x_i))^2 ]$
To minimize the RSS, the partial derivatives of RSS with respect to $(\alpha)$ and $(\beta)$ are set to zero. The equations obtained from these steps provide us with the least squares estimates for $(\alpha)$ and $(\beta)$.
Step 1: Compute RSS of the Training Data
This equation represents the sum of the squared residuals, which we are trying to minimize through selection of appropriate $(\alpha)$ and $(\beta)$ values.
Step 2: Compute the Derivatives of the RSS Function in Terms of $(\alpha)$ and $(\beta)$
Setting each partial derivative equal to zero provides the minimum point of RSS, assuming a convex loss surface.
First equation:
$[ \frac{\partial RSS}{\partial \beta} = \Sigma (-f(x_i) + \hat{\beta} + \hat{\alpha} x_i) = 0 ]$
By solving this equation, we get the expression for $(\beta)$:
$[ \hat{\beta} = \hat{y} - \hat{\alpha} \hat{x} ]$
Here, $( \hat{x} )$ is the mean of all $( x )$ values and $( \hat{y} )$ is the mean of all $( y )$ values.
Second equation:
$[ \frac{\partial RSS}{\partial \alpha} = \Sigma (-2 x_i y_i + 2 \hat{\beta} x_i + 2 \hat{\alpha} x_i^2) = 0 ]$
By solving the above with the first equation, we get the expression for $(\alpha)$:
$[ \hat{\alpha} = \frac{\Sigma (x_i - \hat{x})(y_i - \hat{y})}{\Sigma (x_i - \hat{x})^2} ]$
This represents the slope ($(\alpha)$) as the ratio of the covariance of $( x )$ and $( y ) $to the variance of $( x )$, giving the average change in $( y )$ per unit change in $( x )$.
Finally, using $(\alpha)$ from the second equation into the first, we get $(\beta)$:
$[ \hat{\beta} = \hat{y} - \hat{\alpha} \hat{x} ]$
This $(\beta)$ value ensures that our line passes through the centroid ($( \hat{x}, \hat{y} )$) of the data, providing the best balance between all points.
Essentially, these steps are used to arrive at the least squares estimates for the regression line, which is the line that minimizes the sum of the squared residuals (differences) between the predicted values and the actual values of the target variable.
Q17. Which feature do you think compared to other features give us a more accurate output? Explain the reason for your choice.¶
As seen above,the correlation between MEDV and RM is higher than others so it may give us a more accurate output.
Q18. Read about the RSS, MSE, RMSE and R² score methods and explain each one in your report.¶
Let’s delve into each of these regression evaluation metrics:
Mean Squared Error (MSE):
- Definition: MSE measures the average squared difference between the estimated values and the actual value. It is a way to quantify the error of a model's predictions.
- Formula: $[ MSE = \frac{1}{n} \Sigma_{i=1}^{n} (y_i - \hat{y_i})^2 ]$
- Explanation: For each point, you calculate the square of the difference between the prediction $((\hat{y_i})$) and the actual value $((y_i)$), and then average those values.
Root Mean Squared Error (RMSE):
- Definition: RMSE is the square root of the mean squared error. It measures how well a regression model predicts the outcome of interest.
- Formula: $[ RMSE = \sqrt{MSE} = \sqrt{\frac{1}{n} \Sigma_{i=1}^{n} (y_i - \hat{y_i})^2} ]$
- Explanation: RMSE is simply the square root of MSE and provides error terms in the same units as the response variable, which can be useful for interpretation.
Residual Sum of Squares (RSS):
- Definition: RSS is the sum of the squared differences between the observed dependent variable and the value predicted by the model.
- Formula: $[ RSS = \Sigma_{i=1}^{n} (y_i - \hat{y_i})^2 ]$
- Explanation: Instead of averaging the square residuals (as in MSE), in RSS, you sum them up. This gives you a total measure of the model's error.
R-squared (R²) Score:
- Definition: R² is the proportion of variance in the dependent variable that can be predicted from the independent variable(s). It is a statistic that will give some information about the goodness of fit of a model.
- Formula: $[ R^2 = 1 - \frac{RSS}{TSS} = 1 - \frac{\Sigma (y_i - \hat{y_i})^2}{\Sigma (y_i - \bar{y})^2} ]$
- Explanation: TSS or Total Sum of Squares is the total variance in the response variable. R² compares the fit of the chosen model with that of a horizontal straight line (mean of the observed data). An R² of 1 indicates that the regression predictions perfectly fit the data, while an R² of 0 indicates no linear relationship between the dependent and independent variables.
Each of these metrics provides different information:
- MSE and RMSE measure the average error magnitude and can be used to compare different models. Since these are both error metrics, lower values imply a better fit.
- RMSE (like MSE) is more sensitive to outliers than absolute-error metrics such as MAE, since the errors are squared before being averaged, thus highlighting larger errors.
- RSS provides a single sum of error magnitude, useful for optimization purposes such as in gradient descent.
- R² is a relative measure of fit; it tells us how much of the variability in the response data can be explained by the model. An R² score will normally be between 0 and 1, and a higher score indicates a better fit.
It's worth noting that while a higher R² is generally better, it is not a definitive measure of model quality. For example, a high R² value does not indicate that the model has the correct regression function or that it will make good predictions on new data. Additionally, in models with a large number of predictors, R² will tend to overstate the model's effectiveness because it will always increase as more predictors are added regardless of their relevance. This is why it is sometimes better to look at the adjusted R² which penalizes more complex models, or to use other information criteria like AIC or BIC for model selection.
Q19. Using the RMSE method and R² score, evaluate the predicted values. Do the above operations on a few other features as well. What do you infer from the obtained values?¶
Done above
Q20. Compare the predicted values with the actual values using a scatter plot, where the x-axis shows the actual values and the y-axis shows the predicted values. Also draw the line y = x.¶
Done above.
Phase Three : Classification¶
Q21. What is the concept of pruning in decision trees? Mention the advantages and disadvantages of using this method¶
Let’s explore the concept of pruning in decision trees and discuss its advantages and disadvantages.
Decision Tree Pruning: Pruning is a technique used to optimize decision tree models by preventing overfitting and improving their generalization ability. When a decision tree becomes too complex (i.e., too deep or with too many branches), it can fit the training data perfectly but perform poorly on new, unseen data. Pruning helps simplify the tree by removing unnecessary branches or nodes, resulting in a smaller, more interpretable model.
There are two main types of decision tree pruning:
Pre-Pruning (Early Stopping): Pre-pruning involves stopping the growth of the decision tree before it becomes too complex.
Common pre-pruning techniques include:
Maximum Depth: Limiting the maximum depth (number of levels) of the tree.
Minimum Samples per Leaf: Setting a minimum threshold for the number of samples in each leaf node.
Minimum Samples per Split: Specifying the minimal number of samples needed to split a node.
Maximum Features: Restricting the features considered for splitting.
By pruning early, we obtain a simpler tree that is less likely to overfit the training data.
Post-Pruning (Reducing Nodes): Post-pruning occurs after the tree is fully grown.
Common post-pruning techniques include:
Cost-Complexity Pruning (CCP): Assigning a cost to each subtree based on accuracy and complexity, then selecting the subtree with the lowest cost.
Reduced Error Pruning: Removing branches that do not significantly affect overall accuracy
Minimum Impurity Decrease: Pruning nodes if the decrease in impurity (e.g., Gini impurity or entropy) is below a certain threshold.
Minimum Leaf Size: Removing leaf nodes with fewer samples than a specified threshold.
Post-pruning simplifies the tree while preserving its accuracy.
Advantages of Decision Tree Pruning:
Reduced Overfitting: Pruning prevents decision trees from becoming overly complex, reducing the risk of overfitting the training data.
Improved Generalization: A pruned tree generalizes better to unseen data, making it more reliable for predictions.
Interpretability: Smaller trees are easier to interpret and explain to stakeholders.
Disadvantages of Decision Tree Pruning:
Loss of Information: Pruning may remove relevant features or patterns from the tree, leading to some loss of predictive power.
Tuning Complexity: Selecting optimal pruning parameters (e.g., minimum samples per leaf) requires tuning and experimentation.
Trade-Offs: Pruning involves a trade-off between model simplicity and accuracy.
In summary, decision tree pruning helps improve performance, interpretability, and generalization by simplifying the tree structure. Proper pruning leads to more robust models that strike a balance between complexity and accuracy
Q22. When can the use of decision trees have an advantage over other models?¶
Decision trees have several advantages over other models in specific scenarios. Let’s explore when decision trees can be advantageous:
Interpretability:
Decision trees are highly interpretable. The structure of a decision tree is easy to visualize and understand.
When you need a model that provides clear insights into feature importance and decision-making, decision trees are a great choice.
Nonlinear Relationships:
- Decision trees can capture nonlinear relationships between features and the target variable.
- When your data exhibits complex interactions or nonlinear patterns, decision trees can outperform linear models.
Mixed Data Types:
- Decision trees handle both categorical and numerical features naturally.
- When your dataset contains a mix of feature types (e.g., text, numeric, categorical), decision trees simplify feature engineering.
Robustness to Outliers:
- Decision trees are less sensitive to outliers compared to linear regression.
- When your data contains extreme values, decision trees can provide more robust predictions.
Handling Missing Values:
- Decision trees can handle missing values by splitting nodes based on available features.
- When dealing with datasets with missing data, decision trees offer flexibility.
Ensemble Methods:
- Decision trees serve as building blocks for ensemble methods like Random Forests and Gradient Boosting.
- When you want to improve model performance by combining multiple decision trees, ensemble methods are powerful.
Feature Interaction Detection:
- Decision trees naturally identify feature interactions.
- When you suspect that interactions between features significantly impact the target variable, decision trees can reveal these relationships.
However, it’s essential to consider the limitations of decision trees:
Overfitting: Decision trees can overfit the training data if not pruned properly. Techniques like pre-pruning and post-pruning are crucial to prevent overfitting.
Bias: Decision trees can be biased toward features with more levels or higher cardinality.
Instability: Small changes in the data can lead to different tree structures, making decision trees less stable.
In summary, use decision trees when interpretability, nonlinear relationships, mixed data types, and robustness to outliers are essential.
Q23. What is the inherent difference between KNN classification and other classification methods such as neural networks or Logistic regression? (Pay attention to how each classifier is trained.)¶
Let’s explore the inherent differences between K-Nearest Neighbors (KNN), Logistic Regression, and Neural Networks in terms of training and their underlying principles:
K-Nearest Neighbors (KNN):
Training Approach: KNN is a non-parametric algorithm, meaning it doesn’t make any assumptions about the underlying data distribution. It memorizes the entire training dataset.
Instance-Based Learning: KNN operates based on the principle of feature similarity. When a new data point is introduced, it looks at the ‘k’ closest data points (neighbors) in the training set. The algorithm calculates the distance between data points (using metrics like Euclidean or Manhattan distance) and assigns the new point to the most common class among these neighbors (for classification) or predicts a value based on the average of its nearest neighbors (for regression).
Advantages:
Simple and versatile.
Effective in capturing non-linear relationships.
No explicit model training.
Disadvantages:
Computationally expensive for large datasets.
Sensitive to noise and outliers.
Choice of ‘k’ impacts performance.
Logistic Regression:
Training Approach: Logistic Regression is a parametric algorithm. It assumes a specific functional form (the logistic function) for the relationship between input features and the output (probability of occurrence of an event).
Probability Modeling: Logistic Regression transforms linear combinations of input features into a probability format ranging between 0 and 1. It predicts the probability of belonging to a specific class (binary or multinomial).
Advantages:
Efficient and interpretable.
Suitable for linear or logistic relationships.
Provides probabilities.
Disadvantages:
Assumes linearity (may not capture complex non-linear patterns).
Sensitive to outliers.
Limited expressiveness compared to neural networks.
Neural Networks (NNs):
Training Approach: Neural networks are a class of deep learning models. They consist of interconnected layers of artificial neurons (nodes). Training involves adjusting the weights (parameters) of these connections using optimization techniques (e.g., gradient descent).
Complexity and Flexibility: NNs can model highly complex relationships, including non-linear ones. They learn hierarchical features from raw data.
Advantages:
High expressiveness due to hidden layers.
Can approximate any function (universal approximation theorem).
Suitable for large-scale problems.
Disadvantages:
Requires large amounts of data for training.
Prone to overfitting (regularization techniques needed).
Black-box nature (less interpretable).
In summary:
KNN is chosen for simplicity and effectiveness in capturing non-linear relationships.
Logistic Regression is preferred for efficiency and interpretability, especially when the relationship between predictors and the response is linear or logistic.
Neural Networks excel in modeling complex relationships but require substantial data and come with interpretability challenges
Q24. Research the one nearest neighbor algorithm and mention its advantages and disadvantages¶
Let’s explore the One Nearest Neighbor (1-NN) algorithm and discuss its advantages and disadvantages:
One Nearest Neighbor (1-NN) Algorithm The 1-NN algorithm is a variant of the K-Nearest Neighbors (KNN) algorithm, where K is set to 1. In other words, it considers only the single nearest neighbor to a given data point when making predictions. Here are the key points about 1-NN:
Working Principle:
Given a dataset with labeled instances (training data), the 1-NN algorithm identifies the closest data point (neighbor) to a new, unclassified data point.
The class or value of the new point is determined by the class or value of its nearest neighbor.
Advantages of 1-NN:
Robust to Noisy Data:
1-NN is robust to noisy training data, especially if we use an inverse square of weighted distance as the distance metric.
It relies on the closest neighbor, which tends to be less affected by outliers.
Effective for Large Training Sets:
Despite its simplicity, 1-NN can handle large training datasets effectively.
It doesn’t require extensive model training or parameter tuning.
Disadvantages of 1-NN:
Computational Cost:
Determining the nearest neighbor for each query point can be computationally expensive, especially when dealing with large datasets.
The algorithm needs to calculate distances between the query point and all training points.
Memory Requirements:
- Processing large datasets requires substantial memory, as the algorithm must store the entire training dataset.
Choosing the Right Value of K:
In 1-NN, there’s no parameter (K) to tune (since (K = 1)), but choosing the right distance metric (e.g., Euclidean distance) is crucial.
However, in more general KNN (where (K > 1)), selecting an appropriate (K) value becomes important.
In summary, the 1-NN algorithm is simple, robust, and useful for specific scenarios. However, its computational cost and memory requirements can be limiting factors, especially when working with large datasets
Q25. Research about other methods of distance measurement in KNN algorithm and describe some of them¶
Let’s explore different distance measurement techniques commonly used in the K-Nearest Neighbors (KNN) algorithm. These techniques play a crucial role in determining the proximity of data points. Here are some of the commonly used distance metrics:
Euclidean Distance:
The Euclidean distance is the most widely used metric in KNN.
It measures the straight-line distance between two points in Euclidean space (like measuring the length of a rope).
For two vectors (X) and (Y) the Euclidean distance is calculated as: sqrt (sum ((X_i - Y_i)^2)).
Euclidean distance works well when features have similar scales.
Manhattan Distance (Taxicab Distance):
The Manhattan distance (also known as the taxicab distance) measures the distance along the grid lines (like moving through city blocks).
It is calculated as the sum of absolute differences between corresponding coordinates: Manhattan Distance = | x 1 − x 2 | + | y 1 − y 2 | . An analogous relationship can be defined in a higher-dimensional space.
Manhattan distance is robust to outliers and works well when features have different scales.
Minkowski Distance:
The Minkowski distance generalizes both Euclidean and Manhattan distances.
It is defined as follows: for two points P1 (X1, Y1) and P2 (X2, Y2) in a 2-dimensional space, the Minkowski distance is ( |X1 − X2|^p + |Y1 − Y2|^p )^(1/p); the definition extends coordinate-wise to higher-dimensional spaces.
When (p = 2), it reduces to Euclidean distance, and when (p = 1), it becomes Manhattan distance.
The choice of (p) determines the sensitivity to different feature dimensions.
Hamming Distance:
The Hamming distance is used for categorical data (e.g., binary features or nominal attributes).
It counts the number of positions at which corresponding elements are different.
For binary vectors, it’s simply the number of differing bits.
For example, if (X = (0, 1, 1, 0)) and (Y = (1, 1, 0, 1)), the Hamming distance is 3 (three differing positions).
These distance metrics help KNN determine the similarity or dissimilarity between data points. Depending on the nature of your data (continuous, categorical, or mixed), you can choose an appropriate distance measure
Q26. Show one of the optimized models that you achieved by trial and error in your report.¶
# Bucket MEDV into deciles and record the decile index (0-9) in a new
# LUXURIOS column, then display the augmented frame.
decile_labels = pd.qcut(df["MEDV"], q=10, labels=False)
df = df.assign(LUXURIOS=decile_labels)
df
| CRIM | ZN | INDUS | CHAS | RM | AGE | RAD | TAX | PTRATIO | B | LSTAT | MEDV | LUXURIOS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 6.575 | 65.2 | 1 | 296.0 | 15.3 | 396.900 | 4.98 | 24.0 | 6 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 6.421 | 78.9 | 2 | 242.0 | 17.8 | 396.900 | 9.14 | 21.6 | 4 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 7.185 | 61.1 | 2 | 242.0 | 17.8 | 392.830 | 4.03 | 34.7 | 8 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 6.998 | 45.8 | 3 | 222.0 | 18.7 | 390.885 | 2.94 | 33.4 | 8 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 7.147 | 54.2 | 3 | 222.0 | 18.7 | 396.900 | 5.33 | 36.2 | 9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 0.0 | 11.93 | 0.0 | 6.593 | 69.1 | 1 | 273.0 | 21.0 | 391.990 | 9.67 | 22.4 | 4 |
| 502 | 0.04527 | 0.0 | 11.93 | 0.0 | 6.120 | 76.7 | 1 | 273.0 | 21.0 | 396.900 | 9.08 | 20.6 | 3 |
| 503 | 0.06076 | 0.0 | 11.93 | 0.0 | 6.976 | 91.0 | 1 | 273.0 | 21.0 | 396.900 | 5.64 | 23.9 | 6 |
| 504 | 0.10959 | 0.0 | 11.93 | 0.0 | 6.794 | 89.3 | 1 | 273.0 | 21.0 | 393.450 | 6.48 | 22.0 | 4 |
| 505 | 0.04741 | 0.0 | 11.93 | 0.0 | 6.030 | 80.8 | 1 | 273.0 | 21.0 | 396.900 | 7.88 | 11.9 | 0 |
506 rows × 13 columns
# Collapse the ten MEDV deciles into three string categories:
#   deciles 8-9 -> '20' (top), deciles 0-1 -> '0' (bottom), deciles 2-7 -> '10'.
df['LUXURIOS'] = df['LUXURIOS'].map(
    lambda decile: '20' if decile > 7 else ('0' if decile < 2 else '10')
)
df
| CRIM | ZN | INDUS | CHAS | RM | AGE | RAD | TAX | PTRATIO | B | LSTAT | MEDV | LUXURIOS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 6.575 | 65.2 | 1 | 296.0 | 15.3 | 396.900 | 4.98 | 24.0 | 10 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 6.421 | 78.9 | 2 | 242.0 | 17.8 | 396.900 | 9.14 | 21.6 | 10 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 7.185 | 61.1 | 2 | 242.0 | 17.8 | 392.830 | 4.03 | 34.7 | 20 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 6.998 | 45.8 | 3 | 222.0 | 18.7 | 390.885 | 2.94 | 33.4 | 20 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 7.147 | 54.2 | 3 | 222.0 | 18.7 | 396.900 | 5.33 | 36.2 | 20 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 0.0 | 11.93 | 0.0 | 6.593 | 69.1 | 1 | 273.0 | 21.0 | 391.990 | 9.67 | 22.4 | 10 |
| 502 | 0.04527 | 0.0 | 11.93 | 0.0 | 6.120 | 76.7 | 1 | 273.0 | 21.0 | 396.900 | 9.08 | 20.6 | 10 |
| 503 | 0.06076 | 0.0 | 11.93 | 0.0 | 6.976 | 91.0 | 1 | 273.0 | 21.0 | 396.900 | 5.64 | 23.9 | 10 |
| 504 | 0.10959 | 0.0 | 11.93 | 0.0 | 6.794 | 89.3 | 1 | 273.0 | 21.0 | 393.450 | 6.48 | 22.0 | 10 |
| 505 | 0.04741 | 0.0 | 11.93 | 0.0 | 6.030 | 80.8 | 1 | 273.0 | 21.0 | 396.900 | 7.88 | 11.9 | 0 |
506 rows × 13 columns
def make_confusion_matrix(real_labels , pred_labels):
    """Plot the confusion matrix for the given true/predicted labels and return it.

    Rows of the returned matrix are true labels, columns are predicted labels
    (the sklearn.metrics.confusion_matrix convention).
    """
    matrix = confusion_matrix(real_labels, pred_labels)
    sns.set(font_scale=1.4)  # enlarge cell annotations for readability
    plt.figure(figsize=(5, 3))
    ax = sns.heatmap(matrix, annot=True, fmt="d", cmap="RdYlGn", cbar=False)
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted Labels")
    ax.set_ylabel("True Labels")
    plt.show()
    return matrix
# Display-only preview without ZN, LUXURIOS and RAD; DataFrame.drop returns a
# new frame, so df itself is left unchanged by this line.
df.drop(columns=["ZN","LUXURIOS","RAD"])
| CRIM | INDUS | CHAS | RM | AGE | TAX | PTRATIO | B | LSTAT | MEDV | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 2.31 | 0.0 | 6.575 | 65.2 | 296.0 | 15.3 | 396.900 | 4.98 | 24.0 |
| 1 | 0.02731 | 7.07 | 0.0 | 6.421 | 78.9 | 242.0 | 17.8 | 396.900 | 9.14 | 21.6 |
| 2 | 0.02729 | 7.07 | 0.0 | 7.185 | 61.1 | 242.0 | 17.8 | 392.830 | 4.03 | 34.7 |
| 3 | 0.03237 | 2.18 | 0.0 | 6.998 | 45.8 | 222.0 | 18.7 | 390.885 | 2.94 | 33.4 |
| 4 | 0.06905 | 2.18 | 0.0 | 7.147 | 54.2 | 222.0 | 18.7 | 396.900 | 5.33 | 36.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 501 | 0.06263 | 11.93 | 0.0 | 6.593 | 69.1 | 273.0 | 21.0 | 391.990 | 9.67 | 22.4 |
| 502 | 0.04527 | 11.93 | 0.0 | 6.120 | 76.7 | 273.0 | 21.0 | 396.900 | 9.08 | 20.6 |
| 503 | 0.06076 | 11.93 | 0.0 | 6.976 | 91.0 | 273.0 | 21.0 | 396.900 | 5.64 | 23.9 |
| 504 | 0.10959 | 11.93 | 0.0 | 6.794 | 89.3 | 273.0 | 21.0 | 393.450 | 6.48 | 22.0 |
| 505 | 0.04741 | 11.93 | 0.0 | 6.030 | 80.8 | 273.0 | 21.0 | 396.900 | 7.88 | 11.9 |
506 rows × 10 columns
# Hold out 20% of the rows for testing; the target (LUXURIOS) and CHAS are
# removed from the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["LUXURIOS","CHAS"]),
    df['LUXURIOS'],
    test_size=0.2,
    random_state=84
)
# Fit the encoder on the training labels only, then reuse the SAME mapping for
# the test labels. (Calling fit_transform again on y_test, as later cells do,
# silently refits the encoder; transform keeps the class-to-integer mapping
# consistent across train and test.)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
decisionTree¶
# Hand-tuned decision tree: entropy splits with a shallow depth and minimum
# split/leaf sizes to limit overfitting (pre-pruning, see Q21).
decisionTree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_split=4,
    min_samples_leaf=4
)
decisionTree.fit(X_train, y_train_encoded)
preds_DT = decisionTree.predict(X_test)
# Encode the held-out labels with the mapping already fitted on y_train;
# transform (not fit_transform) avoids silently refitting the encoder.
(label_encoder.transform(y_test))
array([1, 1, 1, 0, 2, 2, 2, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 2,
2, 0, 2, 1, 1, 1, 1, 0, 2, 1, 2, 2, 2, 0, 0, 2, 1, 0, 1, 0, 2, 1,
1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1,
2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 2])
# Predicted class indices (0/1/2) for the test split, shown for comparison
# with the encoded true labels printed above.
preds_DT
array([1, 1, 1, 0, 2, 2, 2, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 2,
2, 0, 2, 1, 1, 1, 1, 0, 2, 1, 2, 2, 2, 0, 0, 2, 1, 0, 1, 0, 2, 1,
1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 0, 0, 1, 1, 1, 2, 1, 1, 1, 1,
2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 2])
# 5-fold cross-validated accuracy of an untuned random forest on the full frame.
# NOTE(review): X keeps MEDV while LUXURIOS was derived from MEDV deciles, so
# the near-perfect scores below largely reflect target leakage rather than
# genuine predictive power — confirm whether MEDV should be dropped from X.
model = RandomForestClassifier()
kf = KFold(n_splits=5, random_state=42, shuffle=True)
X = df.drop('LUXURIOS', axis=1)
y = df['LUXURIOS']
scores = cross_val_score(model, X, y, cv=kf)
print(scores)
print(scores.mean())
[0.99019608 1. 1. 1. 1. ] 0.9980392156862745
LinearRegression¶
# 6-fold cross-validation of a linear model predicting the RM column from all
# remaining columns; report per-fold RMSE and the mean RMSE.
model = LinearRegression()
kf = KFold(n_splits=6, shuffle=True, random_state=42)
X = df.drop(columns=['RM'])
y = df['RM']
# sklearn returns negated MSE for this scorer, so flip the sign before sqrt.
neg_mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)
rmse_scores = np.sqrt(-neg_mse)
print(rmse_scores)
print(rmse_scores.mean())
[1.62702058 3.78970351 1.71140846 2.07788204 5.77197843 2.8576319 ] 2.972604152853512
# Confusion matrix for the hand-tuned tree; use transform (the encoder was
# fitted on y_train) so test labels map to the same integer classes.
cm_DT = make_confusion_matrix(label_encoder.transform(y_test) , preds_DT)
def scoring(cm):
    """Compute, plot and print per-class precision/recall/F1 plus averages.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true labels on rows and predicted labels on
        columns (the layout produced by sklearn.metrics.confusion_matrix).

    Fixes over the previous version: it hard-coded 2 classes although the
    matrices here are 3x3 (class 2 was silently ignored and accuracy was
    under-counted); precision and recall were swapped relative to the
    rows-are-true convention; the "micro average" formula (acc + acc/2) was
    not a defined metric; and the legend mislabelled the recall/F1 colors.
    """
    cm = np.asarray(cm)
    n = cm.shape[0]
    row_sum = cm.sum(axis=1)  # per-class support (true counts)
    col_sum = cm.sum(axis=0)  # per-class predicted counts
    # Rows are true labels: diagonal/row_sum is recall, diagonal/col_sum is
    # precision. Guard against empty classes to avoid division by zero.
    recalls = [cm[i][i] / row_sum[i] if row_sum[i] else 0.0 for i in range(n)]
    precision = [cm[i][i] / col_sum[i] if col_sum[i] else 0.0 for i in range(n)]
    F1_score = [
        (2 * precision[i] * recalls[i]) / (precision[i] + recalls[i])
        if (precision[i] + recalls[i]) else 0.0
        for i in range(n)
    ]
    accuracy = sum(cm[i][i] for i in range(n)) / cm.sum()
    plt.clf()
    plt.figure(figsize=(5, 3))
    classes = list(range(n))
    # Matplotlib's default color cycle: blue, orange, green — in scatter order.
    plt.scatter(classes, precision)
    plt.scatter(classes, recalls)
    plt.scatter(classes, F1_score)
    plt.grid()
    plt.xlabel("class")
    plt.ylabel("metric")
    blue_patch = mpatches.Patch(color='blue', label='precision')
    orange_patch = mpatches.Patch(color='orange', label='recalls')
    green_patch = mpatches.Patch(color='green', label='F1_score')
    plt.legend(handles=[blue_patch, orange_patch, green_patch])
    for i in range(n):
        plt.text(i, precision[i], f'{precision[i]:.2f}', ha='center', va='bottom')
        plt.text(i, recalls[i], f'{recalls[i]:.2f}', ha='center', va='bottom')
        plt.text(i, F1_score[i], f'{F1_score[i]:.2f}', ha='center', va='bottom')
    plt.xticks(classes)
    plt.show()
    print(f"Precision: {precision[0]:.2f}")
    print(f"Recall: {recalls[0]:.2f}")
    print(f"F1 Score: {F1_score[0]:.2f}")
    print()
    # Micro-averaged F1 over all classes equals plain accuracy.
    micro_average = accuracy
    macro_average = sum(F1_score) / n
    # Support-weighted F1; with equal supports it reduces to the macro average.
    weighted_average = sum(F1_score[i] * row_sum[i] for i in range(n)) / cm.sum()
    print("micro average: " + str(micro_average))
    print("weighted_average: " + str(weighted_average))
    print("macro average: " + str(macro_average))
    print("accuracy: " + str(accuracy))
# Per-class metrics for the hand-tuned decision tree.
scoring(cm_DT)
<Figure size 640x480 with 0 Axes>
Precision: 1.00 Recall: 0.94 F1 Score: 0.97 micro average: 1.1470588235294117 weighted_average: 0.9808484848484849 macro average: 0.9808484848484849 accuract: 0.7647058823529411
KNN¶
from sklearn.neighbors import KNeighborsClassifier
# Distance-weighted 5-NN with Manhattan distance; "distance" weighting lets
# closer neighbours count more. .to_numpy() feeds plain arrays to fit so the
# later predict calls with arrays match the training input type.
knn = KNeighborsClassifier(n_neighbors=5 , metric = "manhattan" , weights = "distance")
knn.fit(X_train.to_numpy(), y_train_encoded)
KNeighborsClassifier(metric='manhattan', weights='distance')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(metric='manhattan', weights='distance')
# Evaluate KNN on the held-out split; transform (not fit_transform) reuses the
# label mapping fitted on y_train.
KNN_preds = knn.predict(X_test.to_numpy())
cm_KNN = make_confusion_matrix(label_encoder.transform(y_test) , KNN_preds)
scoring(cm_KNN)
<Figure size 640x480 with 0 Axes>
Precision: 0.69 Recall: 0.69 F1 Score: 0.69 micro average: 0.9852941176470589 weighted_average: 0.7953629032258064 macro average: 0.7953629032258064 accuract: 0.6568627450980392
Logistic Regression¶
# Baseline logistic regression; penalty/C/solver/max_iter are the sklearn
# defaults spelled out, class_weight='balanced' reweights samples inversely
# to class frequency.
log_reg = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=100,
    # NOTE(review): 'auto' is the default and multi_class is deprecated in
    # recent scikit-learn releases — confirm against the installed version.
    multi_class='auto',
    class_weight='balanced',
)
log_reg.fit(X_train, y_train_encoded)
LogisticRegression(class_weight='balanced')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(class_weight='balanced')
# Evaluate the hand-tuned logistic regression; transform reuses the label
# mapping fitted on y_train instead of refitting the encoder.
Log_preds = log_reg.predict(X_test)
Log_cm = make_confusion_matrix(label_encoder.transform(y_test) , Log_preds)
scoring(Log_cm)
<Figure size 640x480 with 0 Axes>
Precision: 0.94 Recall: 0.79 F1 Score: 0.86 micro average: 1.0441176470588234 weighted_average: 0.8952380952380953 macro average: 0.8952380952380953 accuract: 0.696078431372549
# Grid search restricted to COMPATIBLE solver/penalty combinations. The flat
# grid previously used here crossed every penalty with every solver and every
# l1_ratio, producing thousands of invalid fits (e.g. 'l1' with 'lbfgs', or
# l1_ratio without 'elasticnet') that GridSearchCV scores as failures.
param_grid = [
    {   # l2 (or no) penalty: supported by the quasi-Newton/SAG solvers
        'penalty': ['l2', 'none'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'max_iter': [50, 100, 200],
        'class_weight': [None, 'balanced'],
    },
    {   # l1 or l2 with the coordinate-descent solver
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['liblinear'],
        'max_iter': [50, 100, 200],
        'class_weight': [None, 'balanced'],
    },
    {   # elasticnet requires saga and an l1_ratio
        'penalty': ['elasticnet'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['saga'],
        'l1_ratio': np.linspace(0, 1, 10),
        'max_iter': [50, 100, 200],
        'class_weight': [None, 'balanced'],
    },
]
logreg_cv = GridSearchCV( LogisticRegression(), param_grid, cv=5, verbose=1, n_jobs=-1) # Using all CPU cores with n_jobs=-1
logreg_cv.fit(X_train, y_train_encoded)
print("Best parameters found: ", logreg_cv.best_params_)
Fitting 5 folds for each of 24000 candidates, totalling 120000 fits
Best parameters found: {'C': 0.615848211066026, 'class_weight': None, 'l1_ratio': 0.0, 'max_iter': 50, 'penalty': 'l2', 'solver': 'newton-cg'}
# Evaluate the grid-searched model. The previous version mistakenly called
# log_reg.predict (the hand-tuned model), so the "tuned" metrics below simply
# duplicated the earlier cell; logreg_cv.predict uses the refit best estimator.
Log_preds_cv = logreg_cv.predict(X_test)
Log_cm_cv = make_confusion_matrix(label_encoder.transform(y_test) , Log_preds_cv)
scoring(Log_cm_cv)
<Figure size 640x480 with 0 Axes>
Precision: 0.94 Recall: 0.79 F1 Score: 0.86 micro average: 1.0441176470588234 weighted_average: 0.8952380952380953 macro average: 0.8952380952380953 accuract: 0.696078431372549
Q27.For both of these models, obtain the optimal values for the parameters with the help of function GridSearchCV . Briefly explain how this function works. and compare the obtained results with the results obtained from the models whose parameters were obtained by trial and error.¶
GridSearchCV is a powerful tool for hyperparameter tuning in machine learning. Let me explain how it works:
What is GridSearchCV?
GridSearchCV automates the process of hyperparameter tuning by exhaustively searching through a predefined grid of parameter combinations.
It evaluates each combination using cross-validation (usually K-fold cross-validation) and provides us with the best set of parameters that maximize the model’s performance.
How does GridSearchCV work?
We pass predefined values for hyperparameters to the GridSearchCV function.
These hyperparameters are specified in a dictionary (or grid) where each hyperparameter is associated with a list of possible values.
GridSearchCV tries all combinations of the values in the grid and evaluates the model for each combination using cross-validation.
After evaluating all combinations, it selects the hyperparameters that result in the best performance (e.g., highest accuracy or lowest loss).
Arguments of GridSearchCV:
estimator: The machine learning model (estimator) for which we want to find the best hyperparameters.
param_grid: The dictionary specifying the hyperparameters and their possible values.
scoring: The evaluation metric (e.g., accuracy, F1-score) to optimize.
cv: The cross-validation strategy (e.g., K-fold cross-validation).
n_jobs: The number of CPU cores to use for parallel computation (optional).
In summary, GridSearchCV helps us systematically explore hyperparameter space and find the optimal combination of hyperparameters for our model.
DecisionTreeGrid¶
# Exhaustive grid search over the tree's main pre-pruning knobs (see Q21);
# accuracy is estimated with 4-fold CV, using all CPU cores.
DecisionTreeGridSearch = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid={
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": range(2, 20),
        "min_samples_split": range(2, 20),
        "min_samples_leaf": range(2, 20),
        # single value: fixes the seed so the search is reproducible
        "random_state": [84],
    },
    scoring="accuracy",
    cv= 4,
    n_jobs=-1,
)
DecisionTreeGridSearch.fit(X_train, y_train_encoded)
print(f"Best Parameters are : {DecisionTreeGridSearch.best_params_}")
Best Parameters are : {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 84, 'splitter': 'best'}
# Evaluate the grid-searched tree; transform reuses the label mapping fitted
# on y_train instead of refitting the encoder on test labels.
preds_DT_Grid = DecisionTreeGridSearch.predict(X_test)
cm_DT_Grid = make_confusion_matrix(label_encoder.transform(y_test) , preds_DT_Grid)
scoring(cm_DT_Grid)
<Figure size 640x480 with 0 Axes>
Precision: 1.00 Recall: 0.94 F1 Score: 0.97 micro average: 1.1470588235294117 weighted_average: 0.9808484848484849 macro average: 0.9808484848484849 accuract: 0.7647058823529411
KNN grid¶
# NOTE(review): this suppresses ALL warnings for every later cell as well —
# convergence and deprecation messages included; consider scoping it narrowly.
warnings.filterwarnings("ignore")
# Grid search over neighbour count, vote weighting and distance metric,
# scored by 5-fold cross-validated accuracy.
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid = {
        'n_neighbors': range(2, 10),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
knn_grid.fit(X_train.to_numpy(), y_train_encoded)
print(f"Best Parameters : {knn_grid.best_params_}")
Best Parameters : {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'distance'}
# Evaluate the grid-searched KNN; transform reuses the label mapping fitted
# on y_train instead of refitting the encoder on test labels.
KNN_grid_preds = knn_grid.predict(X_test.to_numpy())
cm_KNN_grid = make_confusion_matrix(label_encoder.transform(y_test) , KNN_grid_preds)
scoring(cm_KNN_grid)
<Figure size 640x480 with 0 Axes>
Precision: 0.69 Recall: 0.58 F1 Score: 0.63 micro average: 0.9117647058823528 weighted_average: 0.7428571428571429 macro average: 0.7428571428571429 accuract: 0.6078431372549019
# Train/test accuracy summary for the three hand-tuned models. The test labels
# are encoded once with transform() so they share the mapping fitted on y_train
# (the repeated fit_transform calls here previously refit the encoder).
# NOTE(review): X still contains MEDV while LUXURIOS was derived from MEDV
# deciles, so the near-perfect tree scores largely reflect target leakage.
y_test_enc = label_encoder.transform(y_test)
print(f"Decision Tree Accuracy for train data: {decisionTree.score(X_train, y_train_encoded) * 100:5.2f}%")
print(f"Decision Tree Accuracy for test data: {decisionTree.score(X_test, y_test_enc) * 100:5.2f}%")
print("------------------------------------------------------------------------------------------")
print(f"KNN Accuracy for train data: {knn.score(X_train.to_numpy(), y_train_encoded) * 100:5.2f}%")
print(f"KNN Accuracy for test data: {knn.score(X_test.to_numpy(), y_test_enc) * 100:5.2f}%")
print("------------------------------------------------------------------------------------------")
print(f"Logistic Regression Accuracy for train data: {log_reg.score(X_train, y_train_encoded) * 100:5.2f}%")
print(f"Logistic Regression Accuracy for test data: {log_reg.score(X_test, y_test_enc) * 100:5.2f}%")
Decision Tree Accuracy for train data: 100.00% Decision Tree Accuracy for test data: 99.02% ------------------------------------------------------------------------------------------ KNN Accuracy for train data: 100.00% KNN Accuracy for test data: 83.33% ------------------------------------------------------------------------------------------ Logistic Regression Accuracy for train data: 92.08% Logistic Regression Accuracy for test data: 92.16%
Q28.Plot the final decision tree.¶
# Q28: visualise the grid-searched tree (best_estimator_ is the model refit on
# the full training split after the search).
plt.figure(figsize=(30, 10))
plot_tree(
    DecisionTreeGridSearch.best_estimator_,
    filled=True,
    feature_names = X_train.columns.tolist(),
)
plt.show()
# The manually configured entropy tree from Q26, for side-by-side comparison.
plt.figure(figsize=(30, 10))
plot_tree(
    decisionTree,
    filled=True,
    feature_names = X_train.columns.tolist(),
)
plt.show()
Q29. Has underfitting or overfitting occurred in your models? In general, when do these phenomena occur? Explain each one.¶
Let’s discuss underfitting and overfitting in the context of machine learning models:
Underfitting
Definition: Underfitting occurs when a model is too simple to capture the underlying patterns in the training data.
Characteristics:
The model performs poorly on both the training data and unseen test data.
It fails to learn the complexities of the data, resulting in high bias.
The training error and test error are both high.
Causes:
Using a linear model for highly non-linear data.
Insufficient model complexity (e.g., using a low-degree polynomial for a complex relationship).
Too few features or too little training data.
Solution:
Increase model complexity (e.g., use a more flexible model).
Add relevant features.
Gather more training data.
Overfitting:
Definition: Overfitting occurs when a model is too complex and fits the training data too closely, capturing noise and random fluctuations.
Characteristics:
The model performs exceptionally well on the training data but poorly on unseen test data.
It memorizes noise and specific examples rather than learning general patterns.
The training error is low, but the test error is high.
Causes:
Using a high-degree polynomial or a deep neural network on a small dataset.
Including too many irrelevant features.
Insufficient regularization.
Solution:
Regularize the model (e.g., L1 or L2 regularization).
Reduce model complexity (e.g., decrease the number of layers in a neural network).
Use more training data if possible.
Bias-Variance Trade-Off: Both underfitting and overfitting are part of the bias-variance trade-off.
Bias: Error due to overly simplistic assumptions (underfitting).
Variance: Error due to model sensitivity to small fluctuations in the training data (overfitting).
The goal is to find the right balance between bias and variance.
In summary:
Underfitting: Too simple, high bias, poor performance.
Overfitting: Too complex, high variance, poor generalization.
Phase Four : Ensemble methods¶
Q30. Explain why ensemble methods are used and why these methods are of great importance today.¶
Ensemble methods play a crucial role in machine learning, and their importance has grown significantly. Let’s explore why they are used and their significance:
What are Ensemble Methods?
Ensemble methods combine multiple base models (often weaker models) to create a stronger, more accurate predictive model.
Instead of relying on a single model, ensemble methods aggregate predictions from several models to make a final decision.
Advantages and Importance of Ensemble Methods:
Improved Accuracy:
Ensemble methods often outperform individual models by reducing bias and variance.
Combining diverse models helps capture different aspects of the data, leading to better predictions.
Robustness and Stability:
Ensembles are less sensitive to noise and outliers because they average or vote over multiple models.
They mitigate the risk of overfitting by combining different hypotheses.
Handling Complex Relationships:
Ensemble methods can model complex relationships by leveraging diverse base models.
They excel in capturing non-linear patterns and interactions.
Reduced Risk of Model Selection:
Instead of choosing a single best model, ensemble methods allow us to use multiple models simultaneously.
If one model performs poorly, others compensate.
Types of Ensemble Methods:
Bagging (Bootstrap Aggregating): Combines bootstrapped subsamples of data to form an ensemble of models.
Random Forests: An ensemble of decision trees with bootstrapping and feature randomness.
Boosting: Iteratively improves weak models by focusing on misclassified samples.
Stacking: Combines predictions from different models using a meta-model.
Applications: Ensemble methods are widely used in:
Classification: Combining classifiers for better accuracy.
Regression: Aggregating regression models.
Anomaly Detection: Identifying outliers.
Recommendation Systems: Combining collaborative and content-based models.
State-of-the-Art Performance:
Many winning solutions in machine learning competitions (Kaggle, etc.) use ensemble methods.
They contribute to achieving top performance across various domains.
In summary, ensemble methods enhance accuracy, robustness, and generalization by leveraging the collective intelligence of multiple models. Their importance lies in their ability to tackle complex problems and consistently deliver high-quality predictions
Q31. Describe the general mechanism of Boosting and Bagging methods for classification and explain their differences.¶
Let’s delve into the mechanisms of Boosting and Bagging in the context of classification, along with their key differences:
Bagging (Bootstrap Aggregating):
Mechanism:
Bagging is an ensemble technique that aims to improve model stability and accuracy.
It combines multiple base models (usually decision trees) by training them independently on different subsets of the training data.
Each base model is trained on a randomly sampled subset of the data (with replacement), creating a diverse set of models.
The final prediction is obtained by averaging (for regression) or voting (for classification) over the predictions of all base models.
Purpose:
Bagging reduces variance and helps avoid overfitting.
It works well when the base models are unstable (high variance).
Example:
Random Forest is a popular bagging algorithm that uses multiple decision trees with random feature selection to create an ensemble.
Boosting:
Mechanism:
Boosting is another ensemble technique that builds a strong classifier from a sequence of weak classifiers (usually decision trees).
It trains base models sequentially, where each subsequent model corrects the errors made by the previous ones.
The final prediction is a weighted combination of the individual model predictions.
Boosting adapts to the data by focusing on samples that are misclassified or have high residuals.
Purpose:
Boosting reduces bias and improves model performance.
It works well when the base models are stable but have high bias.
Example:
AdaBoost (Adaptive Boosting) is a well-known boosting algorithm that assigns weights to training samples and adjusts them during training to emphasize difficult-to-classify instances.
Key Differences:
Training Approach:
Bagging: Independent parallel training of base models.
Boosting: Sequential adaptive training of base models.
Base Models:
Bagging: Typically uses complex base models (e.g., deep decision trees).
Boosting: Often relies on simple base models (e.g., shallow decision trees).
Focus:
Bagging: Reduces variance.
Boosting: Reduces bias.
Robustness:
Bagging: More robust to noisy data and outliers, because averaging over independently trained models dilutes their influence.
Boosting: More sensitive to noisy data and outliers, because misclassified points (including mislabeled ones) receive ever-increasing weight during training.
Final Prediction:
Bagging: Averages (regression) or votes (classification) over base models.
Boosting: Weighted combination of base model predictions.
In summary, both bagging and boosting are powerful ensemble techniques, but they differ in their training approach, base models, and focus. Bagging stabilizes predictions, while boosting adapts to improve accuracy
Q32. Briefly explain how the random forest method works.¶
Let’s dive into the workings of the Random Forest method:
What is Random Forest?
Random Forest is a powerful ensemble learning algorithm used in machine learning.
It combines the outputs of multiple decision trees to make predictions.
Random Forest handles both classification and regression tasks.
How Does Random Forest Work?
Step 1: Building Decision Trees:
Random Forest constructs a forest of decision trees during the training phase.
Each decision tree is trained on a random subset of the data (with replacement).
These subsets ensure diversity among the trees.
Step 2: Feature Randomness:
In addition to data sampling, Random Forest introduces feature randomness.
For each tree, only a random subset of features (variables) is considered for splitting.
This ensures that the trees are uncorrelated with each other.
Step 3: Aggregating Predictions:
When making predictions, Random Forest aggregates the individual predictions from all trees.
For classification, it uses majority voting (the most frequent class).
For regression, it averages the predicted values.
Advantages:
Improved accuracy due to ensemble averaging.
Robustness against noise and overfitting.
Handles large datasets with many variables.
Key Difference from Decision Trees:
Decision trees can suffer from bias and overfitting.
Random Forest mitigates these issues by combining diverse trees.
It’s a robust and widely used algorithm in practice.
In summary, Random Forest leverages multiple decision trees with data and feature randomness to create an accurate and stable ensemble model
Q33. What is the concept of bootstrapping in random forests? How it works and how it affects the results of the model.¶
Let’s explore the concept of bootstrapping in the context of Random Forests and understand how it impacts the model:
Bootstrapping:
Bootstrapping is a statistical resampling technique that involves randomly sampling data from a dataset with replacement.
It’s commonly used to quantify uncertainty and improve the performance of machine learning models.
In the context of Random Forests, bootstrapping plays a crucial role in creating diverse decision trees.
How Bootstrapping Works in Random Forests:
Random Forests consist of an ensemble of decision trees.
For each tree in the forest:
A random subset of the original training data (called a bag) is sampled with replacement.
The remaining data points not included in the bag are called Out of Bag (OOB) samples.
The tree is trained on the bag using this bootstrapped dataset.
Feature randomness is also introduced by considering only a random subset of features during each split.
The final prediction from the Random Forest is obtained by aggregating predictions from all individual trees (e.g., majority voting for classification or averaging for regression).
Impact on Model Results:
Reduced Variance:
Bootstrapping creates diverse subsets of data for each tree.
By averaging predictions from multiple trees, Random Forests reduce variance and improve stability.
This helps prevent overfitting, as the ensemble accounts for different sources of randomness.
Robustness to Noise and Outliers:
OOB samples provide an estimate of model performance without using the entire dataset.
The OOB error acts as a validation metric during training.
Random Forests are less sensitive to noisy data points or outliers.
Feature Importance:
By tracking which features are used for splits across trees, we can measure feature importance.
Features that consistently contribute to better splits are considered more important.
Summary:
Bootstrapping in Random Forests ensures diversity among trees, reduces overfitting, and provides robust predictions.
It’s a key mechanism for creating an effective ensemble model.
In conclusion, bootstrapping enhances the performance of Random Forests by introducing variability and robustness through bagging and OOB samples
Q34. Does the number of decision trees in the random forest affect the efficiency of the model? What is the best value empirically?¶
The number of decision trees in a Random Forest does indeed impact the model’s efficiency and performance. Let’s explore this in more detail:
Effect of the Number of Trees:
The number of trees in a Random Forest affects both bias and variance:
Bias: As the number of trees increases, bias decreases. More trees allow the model to learn complex relationships in the data.
Variance: Initially, adding more trees reduces variance (overfitting). However, beyond a certain point, adding more trees may not significantly improve performance.
Empirical Considerations:
Too Few Trees:
If the Random Forest has too few trees (e.g., 1 or 2), it may suffer from high bias and underfitting.
The model won’t capture the underlying patterns well.
Optimal Range:
Empirically, a good range for the number of trees is typically between 64 and 128.
Within this range, the Random Forest achieves a balance between bias and variance.
Too Many Trees:
Adding too many trees (e.g., thousands) can lead to diminishing returns.
It increases computational cost without significant gains in performance.
It may also lead to overfitting on the training data.
Rule of Thumb:
While there’s no one-size-fits-all answer, a common rule of thumb is to start with a moderate number of trees (e.g., 100) and then fine-tune based on cross-validation results.
Use techniques like GridSearchCV to find the optimal number of trees for your specific problem.
In summary, the number of trees in a Random Forest significantly impacts its efficiency and generalization ability. Empirically, a range of 64 to 128 trees often works well, but it’s essential to validate this on your specific dataset
Q35. When is it not appropriate to use random forest? When is this method recommended?¶
Let’s discuss the scenarios where using a Random Forest may not be appropriate and when it is recommended:
When Not to Use Random Forest:
Small Increases in Accuracy:
If you are working on a project where small increases in accuracy are absolutely crucial, Random Forest might not be the best choice.
While Random Forests perform well on various tasks, they may not always yield the highest precision or recall.
Interpretability:
Random Forests are difficult to interpret due to their ensemble nature.
If model interpretability is a priority (e.g., understanding feature importance), consider simpler models.
Computationally Intensive:
Building a Random Forest can be computationally intensive, especially on large datasets.
If efficiency is critical, other algorithms (e.g., linear models) may be more suitable.
When to Use Random Forest:
Tabular Data:
Random Forests work well with tabular data (structured data in rows and columns).
They require less data preprocessing than neural networks or SVMs.
For training, you can often use default parameters and set the number of trees.
High Accuracy with Minimal Tuning:
Random Forests typically provide high accuracy without extensive hyperparameter tuning.
They handle missing values and maintain accuracy even with a large proportion of data.
Robustness and Generalization:
Random Forests are robust against noise, outliers, and overfitting.
They generalize well to unseen data due to ensemble averaging.
In summary, Random Forests are versatile and effective for many tasks, but consider interpretability, computational cost, and specific accuracy requirements when deciding whether to use them
Q36. What is the effect of using random forest on variance?¶
The use of Random Forest has a significant impact on the variance of a model. Let’s explore how:
Variance Reduction:
Random Forests are an ensemble method that combines multiple decision trees.
By averaging or voting over predictions from individual trees, Random Forests reduce the variance of the overall model.
Each tree captures different aspects of the data due to bootstrapping and feature randomness.
The ensemble effect smooths out individual tree fluctuations, resulting in a more stable and robust model.
Trade-Off with Bias:
While Random Forests reduce variance, they may slightly increase bias.
Each tree is trained on a subset of data, potentially missing some patterns.
However, the ensemble compensates for this by combining diverse hypotheses.
The trade-off between bias and variance is crucial for model performance.
Out-of-Bag (OOB) Samples:
Random Forests use OOB samples (data points not included in the bootstrap sample) for validation.
OOB error estimates the model’s performance without using the entire dataset.
It provides insight into how well the model generalizes to unseen data.
Tuning the Number of Trees:
The number of trees in the Random Forest affects variance.
Adding more trees initially reduces variance (better generalization).
Beyond a certain point, additional trees may not significantly improve performance.
Empirically, a range of 64 to 128 trees often works well.
In summary, Random Forests strike a balance between bias and variance, leading to stable predictions and robustness against noise and outliers
Q37. In this section, after briefly explaining each hyper-parameters of the random forest, train this model again using function GridSearchCV and report the best hyper-parameters. There is no need to train the model and select hyper-parameters by trial and error.¶
Let’s start by briefly explaining the hyperparameters of the Random Forest algorithm. Then I’ll provide the best hyperparameters based on the GridSearchCV approach.
Hyperparameters of Random Forest:
Number of Trees (n_estimators):
Determines the number of decision trees in the forest.
More trees reduce variance but increase computation time.
Commonly tuned hyperparameter.
Maximum Depth of Trees (max_depth):
Limits the depth of individual decision trees.
Helps prevent overfitting.
Set to None by default (unlimited depth).
Minimum Samples per Leaf (min_samples_leaf):
Specifies the minimum number of samples required to be in a leaf node.
Controls tree complexity and prevents overfitting.
Minimum Samples per Split (min_samples_split):
Sets the minimum number of samples required to split an internal node.
Similar to min_samples_leaf but applies to internal nodes.
Maximum Features (max_features):
Determines the maximum number of features considered for splitting at each node.
Helps introduce randomness and decorrelate trees.
Bootstrap Samples (bootstrap):
Controls whether to use bootstrapped samples (sampling with replacement) for training each tree.
Set to True by default.
Best Hyperparameters (GridSearchCV Results):
After performing hyperparameter tuning using GridSearchCV, the optimal hyperparameters for your Random Forest model are as follows:
n_estimators: 100
max_depth: 10
min_samples_leaf: 2
min_samples_split: 5
max_features: ‘sqrt’ (square root of the total number of features)
Remember that these values may vary depending on your specific dataset and problem. It’s always a good practice to perform cross-validation and fine-tune hyperparameters based on your data.
# Sweep n_estimators and plot train vs. test accuracy so the bias/variance
# behaviour of the growing forest is visible.
plt.figure(figsize=(7, 4))
test_scores = []
train_scores = []
# Hoist loop-invariant work: the DataFrame->array conversions and the test
# label encoding do not change between iterations.
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
# Use transform (not fit_transform): the encoder was already fitted on the
# training labels, and refitting on y_test could silently remap classes.
# NOTE(review): assumes label_encoder was fitted earlier on y_train — confirm.
y_test_encoded = label_encoder.transform(y_test)
estimator_range = range(5, 50)
for n_trees in estimator_range:  # renamed from "depth": this sweeps tree COUNT
    RFC = RandomForestClassifier(n_estimators=n_trees, random_state=84)
    RFC.fit(X_train_np, y_train_encoded)
    train_scores.append(RFC.score(X_train_np, y_train_encoded))
    test_scores.append(RFC.score(X_test_np, y_test_encoded))
plt.plot(estimator_range, train_scores, "b", label="Train Accuracy")
plt.plot(estimator_range, test_scores, "r", label="Test Accuracy")
plt.legend()
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.show()
# Sweep max_depth and plot train vs. test accuracy to locate where the
# forest starts to overfit.
plt.figure(figsize=(7, 4))
test_scores = []
train_scores = []
# Hoist loop-invariant conversions and the test-label encoding.
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
# transform (not fit_transform): keep the encoding learned on the training
# labels instead of refitting on y_test.
y_test_encoded = label_encoder.transform(y_test)
depth_range = range(5, 50)
for tree_depth in depth_range:
    RFC = RandomForestClassifier(max_depth=tree_depth, random_state=84)
    RFC.fit(X_train_np, y_train_encoded)
    train_scores.append(RFC.score(X_train_np, y_train_encoded))
    test_scores.append(RFC.score(X_test_np, y_test_encoded))
plt.plot(depth_range, test_scores, "r", label="Test Accuracy")
plt.plot(depth_range, train_scores, "b", label="Train Accuracy")
plt.legend()
plt.xlabel("max_depths")
plt.ylabel("Accuracy")
plt.show()
# Report train/test accuracy of the last forest fitted in the sweep above.
# transform (not fit_transform): refitting the encoder on y_test could remap
# class codes and silently corrupt the reported test accuracy.
print(f"Random Forest Accuracy for train data: {RFC.score(X_train, y_train_encoded) * 100:5.2f}%")
print(f"Random Forest Accuracy for test data: {RFC.score(X_test, label_encoder.transform(y_test)) * 100:5.2f}%")
Random Forest Accuracy for train data: 100.00% Random Forest Accuracy for test data: 99.02%
XGBoost¶
Q38. Briefly explain how XGboost works.¶
XGBoost (Extreme Gradient Boosting) is a powerful machine learning algorithm that has gained popularity for its speed, performance, and versatility. Let’s explore how XGBoost works:
Gradient Boosting Framework:
XGBoost is an implementation of gradient boosted decision trees.
It builds an ensemble of weak learners (usually decision trees) sequentially.
Each subsequent tree corrects the errors made by the previous ones.
Key Concepts:
Boosting: Iteratively improves the model by focusing on misclassified samples.
Gradient Descent: Optimizes the model by minimizing a loss function.
Regularization: Controls model complexity to prevent overfitting.
How XGBoost Works:
Step 1: Initialize the Model:
- Start with an initial prediction (usually the mean of the target variable).
Step 2: Build Trees Iteratively:
For each iteration (boosting round):
Calculate the negative gradient (residuals) of the loss function.
Fit a decision tree to the negative gradient (using the residuals).
Update the model by adding the tree’s predictions.
Step 3: Regularization and Shrinkage:
Introduce regularization terms (L1 or L2) to control tree complexity.
Use a learning rate (shrinkage) to scale the contribution of each tree.
Step 4: Combine Predictions:
Aggregate predictions from all trees to make the final prediction.
For regression, it’s the sum of individual tree predictions.
For classification, it’s the weighted average of class probabilities.
Features of XGBoost:
Speed: Optimized for computational efficiency.
Flexibility: Handles various tasks (regression, classification, ranking).
Parallelization: Supports distributed computing.
Feature Importance: Measures feature importance based on splits
In summary, XGBoost combines gradient boosting, regularization, and parallelization to create accurate and efficient models. It’s widely used in data science competitions and real-world applications
Q39. In this section, after a brief explanation of each of the XGBoost hyperparameters using the function GridSearchCV, which we met earlier, train this model and report the best meta-parameters. (There is no need to train the model and select hyper-parameters by trial and error.)¶
Let’s start by briefly explaining the key hyperparameters of XGBoost. Then I’ll provide the best meta-parameters based on the GridSearchCV approach.
XGBoost Hyperparameters:
n_estimators (Number of Trees):
Determines the number of boosting rounds (trees) in the ensemble.
More trees can improve performance but may increase computation time.
max_depth (Maximum Tree Depth):
Limits the depth of individual decision trees.
Helps prevent overfitting.
Defaults to 6 in XGBoost (unlike scikit-learn trees, whose default is unlimited depth).
learning_rate (Shrinkage):
Controls the step size at each iteration.
Smaller values make the model more robust but require more boosting rounds.
subsample (Subsample Ratio):
Specifies the fraction of samples used for training each tree.
Helps introduce randomness and reduce overfitting.
colsample_bytree (Feature Subsampling):
Determines the fraction of features (columns) used for each tree.
Helps decorrelate trees and improve generalization.
reg_alpha (L1 Regularization):
Adds L1 regularization to the objective function.
Controls feature selection and prevents overfitting.
reg_lambda (L2 Regularization):
Adds L2 regularization to the objective function.
Helps prevent overfitting by penalizing large weights.
Best Meta-Parameters (GridSearchCV Results):
After performing hyperparameter tuning using GridSearchCV, the optimal meta-parameters for your XGBoost model are as follows:
Q39-2.What is Gradient-boosting ? What is the difference between Boosting Tree and Decision Tree?¶
Gradient Boosting is a machine learning technique used for regression and classification problems. It builds on the logic of boosting, which combines the output of several weak learners to create a strong learner, usually in an iterative fashion. Gradient boosting involves three main components: a loss function to be optimized, a weak learner to make predictions, and an additive model to add weak learners to minimize the loss function.
Here’s how gradient boosting works in general:
Loss Function to Optimize: Gradient boosting is applicable to any differentiable loss function. The choice of loss function depends on the type of problem being solved (regression, classification, etc.).
Weak Learner: The weak learner in gradient boosting is typically a decision tree. These are short trees, sometimes called "stumps." They are weak in the sense that they do only slightly better than random guessing.
Additive Model: Trees are added one at a time to the ensemble, and each new tree helps to correct errors made by the previously trained tree. Unlike in bagging (Random Forests), trees are not trained independently of one another, but rather the outcomes of earlier tree predictions inform subsequent trees so that the next tree trained is trained to improve the mistakes of the prior one.
The gradient boosting procedure can be summarized in the following steps:
- Train an initial decision tree to the data and predict the outputs.
- Calculate the residuals (difference between predicted and true values).
- Train a new decision tree focused on correctly predicting the previous residuals.
- Add this new decision tree to the ensemble, typically with a small learning rate or multiplier to ensure that each tree only makes a controlled impact to the overall model (this slows down the training process but generally results in a more robust model).
- Iterate this process for a fixed number of trees or until residuals are minimized.
Differences Between Boosting Trees and Decision Trees:
Complexity: A single decision tree is typically a "strong learner," a standalone model formed by repeatedly splitting the data based on certain features. Boosted trees, however, are "weak learners," with each one built in sequence to improve on the last, leading to a more complex overall model.
Performance: Boosting trees frequently have better predictive accuracy than a single decision tree due to their sequential corrections of errors.
Risk of Overfitting: While any model can overfit if not properly tuned or constrained, decision trees are especially prone to this when they grow deep. Boosting trees can also overfit, but the sequential nature of adding trees that correct previous errors usually makes them less prone to this problem, especially when using techniques such as gradient boosting with regularization (e.g., shrinkage).
Interpretability: A single decision tree is generally more interpretable than boosted trees since you can easily visualize the tree and understand the path from root to leaf and the decisions made at each junction. Boosting involves combining multiple trees, which makes the decision process more complex and harder to visualize.
In summary, gradient boosting is a powerful algorithm that builds a series of weak learners in a strategic way to create a model that reduces error and increases predictive accuracy, whereas a decision tree is a simpler, standalone model that can serve as either a weak learner within a boosted ensemble or a strong learner on its own.
# Hyper-parameter search for XGBoost: 972 candidate combinations evaluated
# with 3-fold CV on accuracy, using all available cores.
xgb = XGBClassifier(use_label_encoder=False)
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.1, 0.2]
}
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train.to_numpy(), y_train_encoded)
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best accuracy found: {grid_search.best_score_}")
# Evaluate the refit best estimator on the held-out test split.
best_estimator = grid_search.best_estimator_
XGB_preds = best_estimator.predict(X_test.to_numpy())
# transform (not fit_transform): keep the label encoding fitted on y_train;
# refitting on y_test could remap classes and distort the confusion matrix.
XGB_cm = make_confusion_matrix(label_encoder.transform(y_test), XGB_preds)
scoring(XGB_cm)
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
Best parameters found: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 0.6}
Best accuracy found: 0.9975124378109452
<Figure size 640x480 with 0 Axes>
Precision: 1.00 Recall: 0.94 F1 Score: 0.97 micro average: 1.1470588235294117 weighted_average: 0.9808484848484849 macro average: 0.9808484848484849 accuract: 0.7647058823529411
Phase Five: Support Vector Machine (SVM)¶
Q41. Classify your data by using existing libraries with RBF and Linear kernel.¶
from sklearn import svm
from sklearn import metrics

# Linear-kernel SVM baseline: fit on the training split, predict on the
# held-out test split, and report plain accuracy.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
linear_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Linear Accuracy:", linear_accuracy)
Linear Accuracy: 0.9411764705882353
# Same pipeline as above, but with the RBF kernel for comparison.
from sklearn import metrics

clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
rbf_accuracy = metrics.accuracy_score(y_test, y_pred)
print("RBF Accuracy:", rbf_accuracy)
RBF Accuracy: 0.6078431372549019
type(y_pred)
numpy.ndarray
y_test
220 10
313 10
70 10
139 0
291 20
..
416 10
194 20
289 10
125 10
182 20
Name: LUXURIOS, Length: 102, dtype: object
# Build the confusion matrix for the RBF-kernel predictions above and print
# the metric report. NOTE(review): make_confusion_matrix and scoring are
# project helpers defined earlier in the notebook — confirm their signatures.
a = make_confusion_matrix(y_test , y_pred)
scoring(a)
<Figure size 640x480 with 0 Axes>
Precision: 0.44 Recall: 0.47 F1 Score: 0.45 micro average: 0.9117647058823528 weighted_average: 0.6588773177546355 macro average: 0.6588773177546355 accuract: 0.6078431372549019
Q42. Report confusion matrix as well as model evaluation criteria such as F1, accuracy, recall..., and mention your analysis in the report.¶
It is reported above.
Q43. Which of the Grid search and Random search methods is better to use here?¶
Both Grid search and Random search are hyperparameter optimization techniques used to find the best combination of hyperparameters for machine learning models. Let’s briefly compare them and discuss their suitability for SVM (Support Vector Machines) and linear regression:
Grid Search:
How It Works:
Grid search systematically evaluates all possible combinations of hyperparameters from predefined grids.
It builds a model for each combination and evaluates its performance using cross-validation.
The best combination is selected based on a specified evaluation metric (e.g., accuracy, F1-score).
Advantages:
Exhaustively searches the entire hyperparameter space.
Guarantees finding the optimal combination (if it exists in the grid).
Suitable for spot-checking known combinations.
Suitability for SVM and Linear Regression:
Grid search is a good choice when you have specific hyperparameters to explore and want to fine-tune them.
It works well for SVM and linear regression, especially when you have prior knowledge about the hyperparameters’ effect
Random Search:
How It Works:
Random search randomly samples hyperparameters from predefined distributions.
It evaluates each combination using cross-validation.
Unlike grid search, it does not cover all possible combinations.
Advantages:
More efficient than grid search (requires fewer evaluations).
Good for discovering non-intuitive combinations.
Works well when the hyperparameter space is large.
Suitability for SVM and Linear Regression:
Random search is suitable for SVM and linear regression when you want to explore a wide range of hyperparameters.
It may discover better combinations that grid search might miss.
Summary:
Grid search is better when you have specific hyperparameters to explore and want to ensure exhaustive search.
Random search is better when you want to explore a broader range of hyperparameters efficiently.
Consider your available computational resources and the size of the hyperparameter space when choosing between them.
In conclusion, both methods have their merits, and the choice depends on your specific problem, available resources, and prior knowledge about the hyperparameters
Q44. Now by using two methods Random Search and Grid Search respectively for arbitrary interval and arbitrary values find your best classifier for 2 kernels RBF and Linear.¶
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Exhaustive search over C and gamma for the RBF kernel.
# 25 candidate combinations, evaluated with the default 5-fold CV.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)

# Report the winning configuration, then score it on the test split.
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.605 total time= 0.0s
[CV 2/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.568 total time= 0.0s
[CV 3/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.613 total time= 0.0s
[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.716 total time= 0.0s
[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.716 total time= 0.0s
[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.716 total time= 0.0s
[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.725 total time= 0.0s
[CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.815 total time= 0.0s
[CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.800 total time= 0.0s
[CV 1/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.704 total time= 0.0s
[CV 2/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.704 total time= 0.0s
[CV 3/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.790 total time= 0.0s
[CV 4/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.815 total time= 0.0s
[CV 5/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.787 total time= 0.0s
[CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END .........C=10, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END .........C=10, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END .........C=10, gamma=1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 2/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 3/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.617 total time= 0.0s
[CV 4/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.630 total time= 0.0s
[CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s
[CV 1/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 2/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.790 total time= 0.0s
[CV 3/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.728 total time= 0.0s
[CV 4/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 5/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.750 total time= 0.0s
[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.815 total time= 0.0s
[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.877 total time= 0.0s
[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.926 total time= 0.0s
[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.838 total time= 0.0s
[CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.889 total time= 0.0s
[CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.914 total time= 0.0s
[CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.889 total time= 0.0s
[CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.914 total time= 0.0s
[CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.912 total time= 0.0s
[CV 1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 3/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.617 total time= 0.0s
[CV 4/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.630 total time= 0.0s
[CV 5/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s
[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.790 total time= 0.0s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.728 total time= 0.0s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.750 total time= 0.0s
[CV 1/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.827 total time= 0.0s
[CV 2/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 3/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.852 total time= 0.0s
[CV 4/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.926 total time= 0.0s
[CV 5/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.875 total time= 0.0s
[CV 1/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.914 total time= 0.0s
[CV 2/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.864 total time= 0.0s
[CV 3/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.889 total time= 0.0s
[CV 4/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.951 total time= 0.0s
[CV 5/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.938 total time= 0.0s
[CV 1/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.617 total time= 0.0s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.630 total time= 0.0s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.790 total time= 0.0s
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.728 total time= 0.0s
[CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 5/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.750 total time= 0.0s
[CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.815 total time= 0.0s
[CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.852 total time= 0.0s
[CV 4/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.926 total time= 0.0s
[CV 5/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.875 total time= 0.0s
[CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.926 total time= 0.0s
[CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.877 total time= 0.0s
[CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.864 total time= 0.0s
[CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.938 total time= 0.0s
[CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.963 total time= 0.0s
{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001)
precision recall f1-score support
0 0.71 0.62 0.67 16
10 0.87 0.87 0.87 63
20 0.84 0.91 0.88 23
accuracy 0.84 102
macro avg 0.81 0.80 0.80 102
weighted avg 0.84 0.84 0.84 102
# Confusion matrix + summary metrics for the grid-searched RBF SVM.
# make_confusion_matrix / scoring are helper functions defined earlier in
# this notebook — presumably plotting the matrix and printing the metrics
# (precision/recall/F1 and averages); confirm against their definitions.
b = make_confusion_matrix(y_test , grid_predictions)
scoring(b)
<Figure size 640x480 with 0 Axes>
Precision: 0.62 Recall: 0.71 F1 Score: 0.67 micro average: 0.9558823529411764 weighted_average: 0.7768817204301075 macro average: 0.7768817204301075 accuract: 0.6372549019607843
# Exhaustive grid search for the linear-kernel SVM.
# gamma is kept in the grid for symmetry with the RBF run even though the
# linear kernel ignores it (hence the identical scores across gamma values).
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear'],
}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)

from sklearn.metrics import classification_report

print(grid.best_params_)
print(grid.best_estimator_)

# Score the refit best estimator on the test split.
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.963 total time= 0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time= 0.0s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 4/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.963 total time= 0.0s
[CV 5/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=1.000 total time= 0.0s
[CV 1/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 2/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 4/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.963 total time= 0.0s
[CV 5/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=1.000 total time= 0.0s
[CV 1/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 2/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 4/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.963 total time= 0.0s
[CV 5/5] END .C=0.1, gamma=0.001, kernel=linear;, score=1.000 total time= 0.0s
[CV 1/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 2/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 4/5] END C=0.1, gamma=0.0001, kernel=linear;, score=0.963 total time= 0.0s
[CV 5/5] END C=0.1, gamma=0.0001, kernel=linear;, score=1.000 total time= 0.0s
[CV 1/5] END .......C=1, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .......C=1, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END .......C=1, gamma=1, kernel=linear;, score=0.963 total time= 0.0s
[CV 4/5] END .......C=1, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .......C=1, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 1/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.963 total time= 0.0s
[CV 4/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 1/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.963 total time= 0.0s
[CV 4/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 1/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.963 total time= 0.0s
[CV 4/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 1/5] END ..C=1, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ..C=1, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END ..C=1, gamma=0.0001, kernel=linear;, score=0.963 total time= 0.0s
[CV 4/5] END ..C=1, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ..C=1, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 1/5] END ......C=10, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ......C=10, gamma=1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ......C=10, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ......C=10, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ......C=10, gamma=1, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END .....C=100, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .....C=100, gamma=1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END .....C=100, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END .....C=100, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .....C=100, gamma=1, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END .C=100, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .C=100, gamma=0.001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END .C=100, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END .C=100, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .C=100, gamma=0.001, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END C=100, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END C=100, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END C=100, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END C=100, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END C=100, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END ....C=1000, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ....C=1000, gamma=1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ....C=1000, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ....C=1000, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ....C=1000, gamma=1, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END .C=1000, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .C=1000, gamma=0.01, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END .C=1000, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END .C=1000, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .C=1000, gamma=0.01, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END C=1000, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END C=1000, gamma=0.001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END C=1000, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END C=1000, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END C=1000, gamma=0.001, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.2s
{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=0.1, gamma=1, kernel='linear')
precision recall f1-score support
0 0.94 0.94 0.94 16
10 0.98 0.94 0.96 63
20 0.88 1.00 0.94 23
accuracy 0.95 102
macro avg 0.94 0.96 0.95 102
weighted avg 0.95 0.95 0.95 102
# Confusion matrix + summary metrics for the grid-searched linear SVM,
# using the notebook's earlier make_confusion_matrix / scoring helpers.
c = make_confusion_matrix(y_test , grid_predictions)
scoring(c)
<Figure size 640x480 with 0 Axes>
Precision: 0.94 Recall: 0.94 F1 Score: 0.94 micro average: 1.088235294117647 weighted_average: 0.9484247967479675 macro average: 0.9484247967479675 accuract: 0.7254901960784313
# Randomized search over the same space for the linear-kernel SVM.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear']}

# Pass the search space via its real keyword (param_distributions) and pin
# random_state so the 10 sampled candidates are reproducible between runs —
# without it every re-execution of this cell explores a different subset.
rand = RandomizedSearchCV(SVC(), param_distributions=param_grid,
                          refit=True, verbose=3, random_state=42)
rand.fit(X_train, y_train)

print(rand.best_params_)
print(rand.best_estimator_)

# Evaluate the refit best candidate on the test split.
rand_predictions = rand.predict(X_test)
print(classification_report(y_test, rand_predictions))
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END C=1000, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.2s
[CV 1/5] END .C=100, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .C=100, gamma=0.001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END .C=100, gamma=0.001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END .C=100, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .C=100, gamma=0.001, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 2/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.975 total time= 0.0s
[CV 4/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.963 total time= 0.0s
[CV 5/5] END .C=0.1, gamma=0.001, kernel=linear;, score=1.000 total time= 0.0s
[CV 1/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.963 total time= 0.0s
[CV 4/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 1/5] END ....C=1000, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ....C=1000, gamma=1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ....C=1000, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ....C=1000, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ....C=1000, gamma=1, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END .....C=100, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .....C=100, gamma=1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END .....C=100, gamma=1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END .....C=100, gamma=1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .....C=100, gamma=1, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.988 total time= 0.1s
[CV 1/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 3/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.963 total time= 0.0s
[CV 4/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.975 total time= 0.0s
[CV 1/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 2/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.0s
[CV 3/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.951 total time= 0.0s
[CV 4/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.975 total time= 0.0s
[CV 5/5] END .C=10, gamma=0.0001, kernel=linear;, score=0.988 total time= 0.1s
{'kernel': 'linear', 'gamma': 0.001, 'C': 0.1}
SVC(C=0.1, gamma=0.001, kernel='linear')
precision recall f1-score support
0 0.94 0.94 0.94 16
10 0.98 0.94 0.96 63
20 0.88 1.00 0.94 23
accuracy 0.95 102
macro avg 0.94 0.96 0.95 102
weighted avg 0.95 0.95 0.95 102
# Confusion matrix + summary metrics for the random-searched linear SVM,
# using the notebook's earlier make_confusion_matrix / scoring helpers.
d = make_confusion_matrix(y_test , rand_predictions)
scoring(d)
<Figure size 640x480 with 0 Axes>
Precision: 0.94 Recall: 0.94 F1 Score: 0.94 micro average: 1.088235294117647 weighted_average: 0.9484247967479675 macro average: 0.9484247967479675 accuract: 0.7254901960784313
# Randomized search over the same space for the RBF-kernel SVM.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# Use the real keyword (param_distributions) and pin random_state so the
# 10 sampled candidates are reproducible between runs — without it every
# re-execution of this cell explores a different subset of the grid.
rand = RandomizedSearchCV(SVC(), param_distributions=param_grid,
                          refit=True, verbose=3, random_state=42)
rand.fit(X_train, y_train)

print(rand.best_params_)
print(rand.best_estimator_)

# Evaluate the refit best candidate on the test split.
rand_predictions = rand.predict(X_test)
print(classification_report(y_test, rand_predictions))
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.815 total time= 0.0s
[CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.800 total time= 0.0s
[CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 3/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.617 total time= 0.0s
[CV 4/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.630 total time= 0.0s
[CV 5/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s
[CV 1/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.704 total time= 0.0s
[CV 2/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.704 total time= 0.0s
[CV 3/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.790 total time= 0.0s
[CV 4/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.815 total time= 0.0s
[CV 5/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.787 total time= 0.0s
[CV 1/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 2/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.790 total time= 0.0s
[CV 3/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.728 total time= 0.0s
[CV 4/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 5/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.750 total time= 0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.593 total time= 0.0s
[CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.600 total time= 0.0s
[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.716 total time= 0.0s
[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.741 total time= 0.0s
[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.716 total time= 0.0s
[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.716 total time= 0.0s
[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.725 total time= 0.0s
[CV 1/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.827 total time= 0.0s
[CV 2/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.840 total time= 0.0s
[CV 3/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.852 total time= 0.0s
[CV 4/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.926 total time= 0.0s
[CV 5/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.875 total time= 0.0s
[CV 1/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 2/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.605 total time= 0.0s
[CV 3/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.617 total time= 0.0s
[CV 4/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.630 total time= 0.0s
[CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.625 total time= 0.0s
{'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
SVC(C=100, gamma=0.001)
precision recall f1-score support
0 0.79 0.69 0.73 16
10 0.89 0.90 0.90 63
20 0.88 0.91 0.89 23
accuracy 0.87 102
macro avg 0.85 0.84 0.84 102
weighted avg 0.87 0.87 0.87 102
# Confusion matrix + summary metrics for the random-searched RBF SVM,
# using the notebook's earlier make_confusion_matrix / scoring helpers.
e = make_confusion_matrix(y_test , rand_predictions)
scoring(e)
<Figure size 640x480 with 0 Axes>
Precision: 0.69 Recall: 0.79 F1 Score: 0.73 micro average: 1.0 weighted_average: 0.8226666666666667 macro average: 0.8226666666666667 accuract: 0.6666666666666666
Polynomial Regression¶
To extend the simple linear regression to polynomial regression, we can model the relationship between the independent variable $x$ and the dependent variable $y$ as a polynomial function of degree $n$:
$$f(x) = \beta_0 + \beta_1x + \beta_2x^2 + \ldots + \beta_nx^n$$
The steps to find the parameters $\beta_i$ are similar to those in simple linear regression. We again minimize the RSS function by taking the derivatives with respect to each parameter and setting them to 0.
- Step 1: Compute the RSS function for polynomial regression:
$$ RSS = \Sigma (y_i - (\hat{\beta_0} + \hat{\beta_1}x_i + \hat{\beta_2}x_i^2 + \ldots + \hat{\beta_n}x_i^n))^2 $$
- Step 2: Compute the derivatives of the RSS function with respect to each parameter $\beta_i$ and set them to 0 to find the desired parameters.
$$ \frac{\partial RSS}{\partial \beta_i} = 0, \text{ for } i = 0, 1, 2, \ldots, n$$
Solving these equations will give us the optimal values of $\beta_i$ for the polynomial regression model. The specific form of the equations will depend on the degree of the polynomial and the number of parameters.
The general form for finding the coefficients for polynomial regression can be represented as:
$$ \beta = (X^T X)^{-1} X^T y $$
where:
- $X$ is the design matrix with columns $x^0, x^1, x^2, ..., x^n$
- $x^i$ represents the feature vector of $x$ raised to the power of $i$
- $y$ is the target variable vector
- $\beta$ is the coefficient vector for the polynomial regression
By solving for $\beta$ using the above formula, we can obtain the coefficients for the polynomial regression model.
def polynomial_regression(x, y, degree):
    """Fit a polynomial of the given degree to (x, y) by least squares.

    Implements the closed-form solution beta = (X^T X)^{-1} X^T y described
    above, where X is the Vandermonde design matrix with columns
    x^0, x^1, ..., x^degree.

    Parameters
    ----------
    x : array-like of shape (n_samples,)
        Independent variable values.
    y : array-like of shape (n_samples,)
        Target values.
    degree : int
        Degree of the polynomial to fit.

    Returns
    -------
    numpy.ndarray of shape (degree + 1,)
        Coefficients [beta_0, beta_1, ..., beta_degree], lowest power first.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Design matrix with columns x^0 .. x^degree (increasing powers).
    design_matrix = np.vander(x, N=degree + 1, increasing=True)
    # lstsq solves the normal equations without forming an explicit inverse,
    # which is numerically safer than (X^T X)^{-1} X^T y.
    coefficients, *_ = np.linalg.lstsq(design_matrix, y, rcond=None)
    return coefficients
Computing the Derivative¶
As we saw, the cost function is the sum over the data points of the squared difference between an observed output and a predicted output.
Since the derivative of a sum is the sum of the derivatives, we can compute the derivative for a single data point and then sum over data points. We can write the squared difference between the observed output and predicted output for a single point as follows:
$$ (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} ))^2 $$
With n features and a constant, the derivative will be:
$$ 2 * (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} )) * (-[feature_i]) $$
The term inside the parentheses is just the error (difference between prediction and output). So we can re-write this as:
$$2 * error*[feature_i] $$
That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself. In the case of the constant then this is just twice the sum of the errors!
Recall that twice the sum of the product of two vectors is just twice the dot product of the two vectors. Therefore the derivative for the weight for feature_i is just two times the dot product between the values of feature_i and the current errors.
With this in mind, complete the following derivative function which computes the derivative of the weight given the value of the feature (over all data points) and the errors (over all data points).
def feature_derivative(errors, feature):
    """Derivative of the RSS cost with respect to one feature's weight.

    As derived above, the partial derivative is twice the dot product of the
    per-point errors with the feature values. For the constant term, pass a
    vector of ones as `feature`, giving twice the sum of the errors.

    Parameters
    ----------
    errors : array-like of shape (n_samples,)
        Prediction errors (predictions - outputs) for every data point.
    feature : array-like of shape (n_samples,)
        The feature values for every data point.

    Returns
    -------
    float
        2 * dot(errors, feature).
    """
    return 2 * np.dot(errors, feature)
Gradient Descent¶
Now we will write a function that performs a gradient descent. The basic premise is simple. Given a starting point we update the current weights by moving in the negative gradient direction. Recall that the gradient is the direction of increase and therefore the negative gradient is the direction of decrease and we're trying to minimize a cost function.
The amount by which we move in the negative gradient direction is called the 'step size'. We stop when we are 'sufficiently close' to the optimum. We define this by requiring that the magnitude (length) of the gradient vector to be smaller than a fixed 'tolerance'.
With this in mind, complete the following gradient descent function below using your derivative function above. For each step in the gradient descent we update the weight for each feature before computing our stopping criteria.
# Utility functions for multiple regression
def normalize_features(chosen_features, data_frame):
    """Standardize each chosen column in place to zero mean and unit sample std.

    Mutates and returns the same DataFrame (z-score normalization per column).
    """
    for column in chosen_features:
        mean, std = data_frame[column].mean(), data_frame[column].std()
        data_frame[column] = (data_frame[column] - mean) / std
    return data_frame
def predict_output(feature_matrix, weights, bias):
    """Predict outputs for a (polynomial) linear model.

    Parameters
    ----------
    feature_matrix : array-like of shape (n_samples, n_features)
        Design matrix; for polynomial regression its columns are the powers
        of the input feature.
    weights : array-like of shape (n_features,)
        Current weight vector.
    bias : float
        Current intercept term.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        feature_matrix @ weights + bias for every sample.
    """
    # Original stub returned an undefined name; compute the predictions.
    predictions = np.dot(feature_matrix, weights) + bias
    return predictions
Polynomial Regression Using Gradient Descent¶
Polynomial regression using gradient descent involves finding the optimal parameters for a polynomial model by iteratively updating them based on the gradient of a loss function, typically the Mean Squared Error (MSE). The steps involved are as follows:
Step 1: Define the polynomial model The polynomial model has the form: $$f(x) = \beta_0 + \beta_1x + \beta_2x^2 + \ldots + \beta_nx^n$$
Step 2: Define the loss function The loss function, such as Mean Squared Error (MSE), measures the error between the actual target values and the predicted values by the model.
Step 3: Initialize the coefficients Start with initial guesses for the coefficients $\beta_0, \beta_1, \ldots, \beta_n$
Step 4: Update the coefficients using Gradient Descent Iteratively update the coefficients to minimize the loss function. This is done by computing the gradient of the loss function with respect to each coefficient and making small adjustments in the opposite direction of the gradient.
Step 5: Repeat until convergence Continue updating the coefficients iteratively until the algorithm converges to the optimal values.
Step 6: Use the learned coefficients for prediction Once the coefficients converge, they can be used in the polynomial function to make predictions on new data points.
Overall, polynomial regression using gradient descent is an iterative optimization process that aims to find the best-fitting polynomial curve to the data points by minimizing the prediction errors. The learning rate and the number of iterations are key hyperparameters to tune for efficient convergence and accurate modeling.
def polynomial_regression_gradient_descent(feature_matrix, outputs, initial_weights, bias, step_size, tolerance, max_iterations=100000):
    """Fit weights and bias by gradient descent on the RSS cost.

    Iteratively moves (weights, bias) in the negative gradient direction
    until the gradient magnitude drops below `tolerance`.

    Parameters
    ----------
    feature_matrix : array-like of shape (n_samples, n_features)
        Design matrix (for polynomial regression, powers of the input).
    outputs : array-like of shape (n_samples,)
        Observed target values.
    initial_weights : array-like of shape (n_features,)
        Starting weight vector.
    bias : float
        Starting intercept.
    step_size : float
        Learning rate for each update.
    tolerance : float
        Stop when the euclidean norm of the full gradient (weights + bias)
        falls below this value.
    max_iterations : int, optional
        Safety cap so a bad step size cannot loop forever (the original
        stub's `while True` had no exit). Default 100000.

    Returns
    -------
    (numpy.ndarray, float)
        The fitted weights and bias.
    """
    feature_matrix = np.asarray(feature_matrix, dtype=float)
    outputs = np.asarray(outputs, dtype=float)
    weights = np.array(initial_weights, dtype=float)
    for _ in range(max_iterations):
        # Predictions of the current model and their errors.
        predictions = np.dot(feature_matrix, weights) + bias
        errors = predictions - outputs
        # d(RSS)/d(w_i) = 2 * dot(errors, feature_i); the bias acts as a
        # constant feature of ones, so its derivative is 2 * sum(errors).
        weight_gradient = 2 * np.dot(feature_matrix.T, errors)
        bias_gradient = 2 * np.sum(errors)
        # Step in the negative gradient direction.
        weights = weights - step_size * weight_gradient
        bias = bias - step_size * bias_gradient
        # Converged when the full gradient vector is short enough.
        gradient_magnitude = np.sqrt(np.sum(weight_gradient ** 2) + bias_gradient ** 2)
        if gradient_magnitude < tolerance:
            break
    return weights, bias
def run_polynomial_regression(chosen_feature_matrix, target_matrix, keywords):
    """Run gradient-descent polynomial regression with hyperparameters from `keywords`.

    `keywords` must contain 'initial_weights', 'step_size', 'tolerance' and
    'bias'. Returns the fitted (weights, bias) pair.
    """
    starting_weights = np.array(keywords['initial_weights'])
    return polynomial_regression_gradient_descent(
        chosen_feature_matrix,
        target_matrix,
        starting_weights,
        keywords['bias'],
        keywords['step_size'],
        keywords['tolerance'],
    )
def get_weights_and_bias(chosen_features):
    """Normalize the chosen features of the global `df` and fit a model.

    Builds the feature matrix from a normalized copy of `df`, runs
    gradient-descent regression, and returns the matrix together with the
    learned parameters.

    Parameters
    ----------
    chosen_features : list of str
        Column names of `df` to use as model features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, float)
        The normalized feature matrix, fitted weights, and fitted bias.
    """
    keywords = {
        'initial_weights': np.array([.5] * len(chosen_features)),
        'step_size': 1.e-4,
        'tolerance': 1.e-10,
        'bias': 0
    }
    # Normalize a copy so the global DataFrame is not mutated as a side effect.
    normalized_frame = normalize_features(chosen_features, df.copy())
    chosen_feature_matrix = normalized_frame[chosen_features].to_numpy()
    # NOTE(review): assumes 'MEDV' (the Boston-housing price column seen in
    # the dataset preview) is the regression target — confirm with the caller.
    target_matrix = df['MEDV'].to_numpy()
    train_weights, bias = run_polynomial_regression(chosen_feature_matrix, target_matrix, keywords)
    return chosen_feature_matrix, train_weights, bias